How to use perplexity to select an optimal VBEM prior for RNA transcript quantification with Salmon

Spencer Skylar Chan

Introduction

Suppose you performed RNA sequencing on some cells to extract the RNA fragments inside them. You now have a sample of RNA fragments (reads), and you want to know how many RNAs came from each transcript of interest. Counting RNAs is useful for downstream analyses such as differential expression, which measures the up- and down-regulation of the genes that produced the RNAs. Knowing which genes are more or less expressed is useful for studying diseases such as cancer. The standard approach is RNA transcript abundance estimation, which can be summarized as follows:

Input: RNA reads and a transcriptome.

Output: the expected number of fragments mapping to each transcript; equivalently, the expected number of RNAs generated per transcript in the transcriptome.

One RNA transcript quantification tool is Salmon, which is maintained by Professor Rob Patro at the University of Maryland. Salmon can perform transcript quantification using Variational Bayes Expectation Maximization (VBEM). When using VBEM, Salmon has the flag --vbprior, which is the per-nucleotide VBEM prior. According to the Salmon documentation:

The default prior used in the VB optimization is a per-nucleotide prior of 1e-5 reads per-nucleotide. This means that a transcript of length 100000 will have a prior count of 1 fragment, while a transcript of length 50000 will have a prior count of 0.5 fragments, etc.
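In other words, the prior count for a transcript is simply its length times the per-nucleotide prior. A tiny illustrative Python snippet (not part of Salmon):

def prior_count(transcript_length, vb_prior=1e-5):
    # Per-transcript prior count = length in nucleotides * per-nucleotide prior.
    return transcript_length * vb_prior

print(prior_count(100_000))  # 1.0 fragment
print(prior_count(50_000))   # 0.5 fragments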

This prior is essentially the background expression expected in the sample. A smaller prior results in a sparser estimate; a larger prior results in more non-zero estimates. It is important to select a VBEM prior that produces the best estimate for a sample, because increasing the quality of the estimate increases the quality of any downstream analysis and applications. This makes the prior a hyperparameter: a parameter that is not estimated from the data but that affects the model's predictions. The general strategy for hyperparameter selection is to run a model repeatedly on the same input data with varying hyperparameters, then select the hyperparameter whose output has the lowest measure of error. This is essentially model selection in machine learning, so in a sense, VBEM prior selection is a learning problem.
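As a sketch, the selection loop is just the following, where run_model and evaluate_error are hypothetical placeholders for the quantifier and the error metric:

def select_hyperparameter(candidates, run_model, evaluate_error):
    # Try each candidate value, score the model's output, keep the best.
    best_value, best_error = None, float("inf")
    for value in candidates:
        estimate = run_model(value)        # e.g. quantify with this prior
        error = evaluate_error(estimate)   # e.g. some error metric
        if error < best_error:
            best_value, best_error = value, error
    return best_value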

We could make it a supervised learning problem and evaluate estimates directly against a known transcript expression distribution (the ground truth). But we usually don't have the ground truth. With simulated data, we can compare the transcript abundance estimate to the ground truth from which we simulated the data, using a metric like Spearman correlation. But simulated reads are not as interesting as real reads extracted from cells. With real reads, we usually don't know the ground truth; finding it is the whole point! We could run qPCR with fluorescent primers for the transcripts of interest and measure the fluorescence intensity (more highly expressed genes will have higher intensity). But qPCR is not the most reliable way to count RNAs, so most people send reads for sequencing instead of running qPCR on them.

Since we can't rely on the ground truth for evaluating transcript abundance estimates, let's evaluate estimates without the ground truth! In other words, let's make VBEM prior selection an unsupervised learning problem. We described in a paper how to measure the accuracy of an estimate using a "quantify-then-validate" approach. The evaluation metric we use is an adaptation of perplexity from natural language processing to transcriptomics. In this context, perplexity is the inverse geometric mean per-read likelihood of a held-out test set. To compute perplexity, we split our reads into a training set and a test set. We quantify an abundance estimate on the training set, holding out the test set. Then we validate the estimate by computing its perplexity with respect to the held-out reads. Figure 2 of our paper shows the quantify-then-validate approach with an additional smoothing step, which we'll discuss later.
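In code, the definition looks like this minimal sketch (the actual binary computes likelihoods over equivalence classes rather than individual reads, as discussed later):

import numpy as np

def perplexity(read_probs):
    # Inverse geometric mean of the per-read likelihoods:
    # exp(-(1/N) * sum_i log p(read_i))
    log_p = np.log(np.asarray(read_probs, dtype=float))
    return float(np.exp(-np.mean(log_p)))

# Four held-out reads with these likelihoods under the estimate:
print(perplexity([0.01, 0.02, 0.005, 0.01]))  # 100.0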

The lower the perplexity, the better the abundance estimate describes the held-out reads. Perplexity is therefore a proxy for the goodness of an abundance estimate; you can think of it as the error metric for abundance estimates. We show in the paper that in experimental data, the estimates with the lowest perplexity correlate best with qPCR measurements. In simulated data, perplexity achieves a local minimum in the neighborhood of the default VBEM prior value and corresponds well with other measures of accuracy based on the ground truth and with differential expression analysis.

The moral of the story is that hyperparameter selection for transcript quantification algorithms is now possible. In the case of Salmon and the VBEM prior, the problem becomes:

Input: RNA reads and a transcriptome.

Output: the best VBEM prior, i.e. the one that generates the abundance estimate with the smallest perplexity.

In this tutorial I will demonstrate hyperparameter selection of the VBEM prior with perplexity and k-fold cross-validation, in the absence of ground truth.

Setup

Open a shell and clone my perplexity repository, then check out the tutorial branch.

git clone https://github.com/schance995/perplexity.git
cd perplexity
git checkout tutorial

This repository is a fork of the COMBINE lab's perplexity repository, and it also contains some modified workflows from the perplexity-paper repository.

The source code for the perplexity program is included. It's written in Rust, which was chosen for its speed, memory safety, and excellent compiler and documentation. Install Rust if you don't have it already.

curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs -o rustup-installer.sh
# check that the script looks ok
less ./rustup-installer.sh
# run the installer
sh ./rustup-installer.sh
# install the stable Rust toolchain and set it as the default
rustup default stable

Installing Rust makes the cargo package manager available. Use cargo to compile the perplexity binary; it will download the needed dependencies (crates) for you.

cargo build --release

Next we'll install some Python and R packages using the Conda package manager. If you don't already have Conda, you can install Miniconda, which provides Conda without the extra packages we don't need for this tutorial.

Once you've installed Conda, install Mamba, which resolves dependencies much faster than Conda. You can also install Mamba standalone if you like, but for now we'll install Mamba through the Conda repositories.

conda install mamba -n base -c conda-forge

Mamba has a limitation where it cannot create new environments from an environment file. As a workaround, create a new environment, then update it with env.yml:

conda env create -n tutorial
mamba env update -n tutorial --file env.yml

Activate the environment to make the packages available in your shell.

conda activate tutorial

Now we are ready to configure the Snakemake pipeline.

Configuration of the Snakemake pipeline

The code to perform VBEM prior selection is a Snakemake pipeline. Snakemake is like GNU Make, but for reproducible data science: it generates all output files from the specified input files and runs the intermediate steps in parallel and in the correct order (dependency resolution).
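If you haven't used Snakemake before, here is a toy rule (illustrative, not from this repository) showing the idea: each rule declares its inputs, outputs, and command, and Snakemake schedules rules by matching the files they produce and consume.

rule sort_file:
    # Snakemake runs this rule whenever sorted.txt is requested
    # and is missing or older than raw.txt.
    input: "raw.txt"
    output: "sorted.txt"
    shell: "sort {input} > {output}"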

Before running the pipeline, edit snakefiles/config.yml, the configuration file that declares the inputs to the analysis. The most important lines are the ones declaring the transcriptome, the directory with the reads, and which samples to use.

# results will go in {out-dir}/{exp-name}
out-dir: output
exp-name: tutorial

# expects paired end reads with format:
#    {reads-dir}/{sample_name}_1.fasta
#    {reads-dir}/{sample_name}_2.fasta
read_file_fmt: fasta
# replace with your reads directory
reads-dir: /fs/cbcb-lab/rob/students/jason/shared/SRR1265495/
# replace with your sample names to quantify (can be more than 1)
sample-names:
  - sample_01
  # - sample_02

# Path to reference transcriptome for salmon (replace with your transcriptome)
txome-path: /fs/cbcb-lab/rob/students/jason/shared/annotations/hg19_clean.fa

# where to find folds, recommended to put this in {current working directory}/output/tutorial/folds
# for example my working directory is /fs/cbcb-lab/rob/students/skylar/perplexity
folds-dir: /fs/cbcb-lab/rob/students/skylar/perplexity/output/tutorial/folds

# Path to perplexity binary, and kfolds.sh script
perplexity-bin: target/release/perplexity
kfold-script: scripts/kfolds.sh

# Number of folds
k: 5

# VBEM priors to test with
# The default value is 1e-5
prior-weights:
  - 1e-6
  - 1e-5
  - 1e-4
  - 1e-3
  - 1e-2
  - 1e-1
  - 1e0
  - 1e1
  - 1e2
  - 2
  - 3
  - 4
  - 5
  - 6
  - 7 
  - 8
  - 9

Run the pipeline

Generate k folds

The first step of the pipeline is to create k folds for cross-validation. For this tutorial we'll use k=5. Specify the Snakefile to run with --snakefile and the config file you wrote earlier with --configfile. It is a good idea to use the -n flag first to do a dry run in case there are any errors.

# check the files that this snakefile will produce
snakemake --snakefile snakefiles/kfold.snk --configfile <config file> -n

The output should list the rules that will be run and the files that will be generated by this Snakefile. This Snakefile prepares train and test sets suitable for k-fold cross-validation using a helper script [1]. It will multiply the disk space used by your reads by k, which should not be a problem on a cluster but might be on a small hard drive.
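Conceptually, the split looks like the following Python sketch; the actual work is done on the FASTA files by scripts/kfolds.sh.

import numpy as np

def kfold_indices(n_reads, k=5, seed=0):
    # Shuffle the read indices, cut them into k folds, and yield
    # (train, test) index sets with each fold held out once.
    rng = np.random.default_rng(seed)
    folds = np.array_split(rng.permutation(n_reads), k)
    for i in range(k):
        train = np.concatenate([f for j, f in enumerate(folds) if j != i])
        yield train, folds[i]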

When you are ready to run the snakefile, remove -n and add -j <n cores> to the command, where <n cores> is the maximum number of jobs you want Snakemake to run at any given time. If you specify -j without a number, all available cores will be used. This is fine if you have sufficient memory and CPU available [2], but if you run many large samples, the Linux out-of-memory killer might terminate Salmon and other memory-hungry programs. For example, to run the pipeline with at most 8 jobs at a time:

# run the pipeline for real
snakemake --snakefile snakefiles/kfold.snk --configfile <config file> -j 8

This will take a few minutes per sample, depending on the size of your data.

Transcript quantification, cross-validation, and perplexity calculation

After the kfolds snakefile finishes, let's go through the same routine for the perplexity snakefile.

# check the files that this snakefile will produce
snakemake --snakefile snakefiles/perplexity.snk --configfile <config file> -n

There are many steps in this Snakefile. First, the transcriptome is indexed. Then Salmon computes transcript abundance estimates for the training reads of each fold and each VBEM prior. Salmon also counts the equivalence classes for the test reads of each fold [3]. Finally, the perplexity binary computes the perplexity of each fold's training estimate against its test equivalence classes, for every prior.

# run the pipeline for real
snakemake --snakefile snakefiles/perplexity.snk --configfile <config file> -j <n jobs>

This will take longer than creating the k-folds, so you might want to grab a cup of coffee (or tea).

Inspect the perplexity results

For the purposes of this tutorial I've included the perplexity results in the example-output directory of the repository. Let's take a look at one of the perplexity output files, for example example-output/tutorial/perplexity/sample_01/vbprior=1e-5/beta=1e-8/1/perplexity.yml:

---
smoothed_perplexity: 1500.9738357618098
n_ecs: 188424
n_reads: 3035878
n_possible_ecs: 185426
n_possible_reads: 3032506
n_impossible_ecs: 2998
n_impossible_reads: 3372
n_discarded_ecs: 665
n_discarded_reads: 733
class_ec_freq:
...

This abundance estimate was generated with a VBEM prior of 1e-5 (the default), and the perplexity is about 1501. There's also extra information about the kinds of reads and equivalence classes encountered in this sample.

You might wonder why it says smoothed perplexity. The reason is impossible reads: reads in the test set that have zero probability of being generated from the training-set estimate, which makes the perplexity undefined. In Figure 1 of our paper, we show that impossible reads occur rather frequently. The biggest problem with impossible reads is that plain perplexity calls abundance estimates with any number of impossible reads equally bad, when in reality the fewer impossible reads the better. The solution is smoothing: apply a small prior probability to each read so that no read has zero probability. We discuss this in more detail in sections 3.1 and 3.2 of the perplexity paper. Although the paper used Laplacian smoothing, the perplexity binary has since been updated to use the Linear Good-Turing estimator instead.
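For intuition, here is a sketch of the additive (Laplacian-style) smoothing idea from the paper; it is not the Good-Turing estimator the binary uses now, and vocab_size is an illustrative stand-in for the number of possible reads.

import numpy as np

def smoothed_perplexity(read_probs, beta=1e-8, vocab_size=10**6):
    # Mix each read's probability with a small uniform prior beta so
    # that impossible reads (p = 0) no longer make the metric undefined.
    p = np.asarray(read_probs, dtype=float)
    p_smooth = (p + beta) / (1.0 + beta * vocab_size)
    return float(np.exp(-np.mean(np.log(p_smooth))))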

Visualize the perplexities and VBEM priors for each fold

To perform hyperparameter selection with perplexity, we'll need to visualize the perplexities for each sample and VBEM prior. Let's use some Python data science to achieve this. Open the Jupyter notebook located at notebooks/tutorial.ipynb in the repository. First, import some useful packages into the Python session (we installed these earlier with Conda/Mamba).

Next, load the config file we used for the Snakemake pipelines into our session. We'll extract the information we need to find all of the perplexity outputs.

Now define and run a function to plot the log10 VBEM prior against the perplexity for each fold.
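Here is a minimal sketch of what the notebook does. It assumes the output layout shown above (including the beta=1e-8 path component) and that directories are named after the config values; the actual code lives in notebooks/tutorial.ipynb.

import yaml
import numpy as np
import matplotlib.pyplot as plt

# Load the same config used by the Snakemake pipelines.
with open("snakefiles/config.yml") as f:
    config = yaml.safe_load(f)

out_dir, exp_name = config["out-dir"], config["exp-name"]
priors = [str(p) for p in config["prior-weights"]]
log_priors = [np.log10(float(p)) for p in priors]
k = config["k"]

def plot_perplexities(sample):
    # One gray line per fold: smoothed perplexity across the VBEM priors.
    per_fold = []
    for fold in range(1, k + 1):
        ppls = []
        for prior in priors:
            path = (f"{out_dir}/{exp_name}/perplexity/{sample}/"
                    f"vbprior={prior}/beta=1e-8/{fold}/perplexity.yml")
            with open(path) as f:
                ppls.append(yaml.safe_load(f)["smoothed_perplexity"])
        per_fold.append(ppls)
        plt.plot(log_priors, ppls, color="gray", alpha=0.6)
    # Red line: perplexity averaged across folds at each prior.
    plt.plot(log_priors, np.mean(per_fold, axis=0), color="red")
    plt.xlabel("log10 VBEM prior")
    plt.ylabel("smoothed perplexity")
    plt.title(sample)
    plt.show()
    return per_fold

per_fold = plot_perplexities(config["sample-names"][0])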

The x-axis is the log10 VBEM prior, and the y-axis is the perplexity. Each gray line traces one fold's perplexity across the VBEM priors. The red line is the perplexity averaged across folds for each VBEM prior.

Select the VBEM prior with the smallest perplexity

Since perplexity is a proxy for the error of an estimate, we should select the VBEM prior that produces the smallest perplexity. The minimum perplexity lies in the dip between 0 and 1 on the log10 axis, and the corresponding prior is 3. Therefore we should run Salmon with --vbprior=3 for this sample; the resulting estimate should be the best one to use for downstream analysis such as differential expression. Note that the optimal prior, as suggested by the smallest perplexity, will depend on your sample, which is why the config file can take multiple samples.
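Continuing the plotting sketch above, the selection itself is a couple of lines:

# Choose the prior whose mean perplexity across folds is smallest.
mean_ppl = np.mean(per_fold, axis=0)
best_prior = priors[int(np.argmin(mean_ppl))]
print(f"best VBEM prior: {best_prior}")  # here: 3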

Perplexity can also be used to investigate other RNA transcript quantification questions.

How does read depth impact the optimal VBEM prior?

Increasing the sample size is a surefire way to improve the accuracy and confidence of an estimate. In RNA sequencing, the number of reads is known as the read depth. Most experiments require 5-200 million reads per sample, according to Illumina. Increasing the read depth also increases the likelihood of capturing rare transcripts. With this in mind, I hypothesize that the optimal VBEM prior will decrease as the read depth increases. Intuitively, when there are more reads, we are more certain to find rare transcripts, so we can assign them a lower prior.

To test this hypothesis, I'll use holdout evaluation on samples of 10 to 40 million reads drawn from the same distribution (or organism), with a test set of 20 million reads. To keep the comparison fair, I'll use the same test set for every sample. One caveat is that different reads will be called impossible for each sample, but this should not obscure the general trend.

If you just want to see the results, you can skip to the plots made from the perplexity outputs in example-output/readdepth. Otherwise, keep reading.

Simulate reads

Reads will be simulated with Polyester, with a distribution based on an abundance estimate. Modify config-simul.yml to declare your inputs:

# results will go in output/{exp-name}
exp_name: simul_reads
# reads to base the simulation off of
reads_dir: /fs/cbcb-lab/rob/students/jason/shared/COPD-raw-data/output
# transcriptome used to generate the abundance estimate
tx_path: /fs/cbcb-lab/rob/students/jason/shared/annotations/hg19_clean.fa
# name of samples to use
sample_names:
  - SRR1265600

Polyester simulates a fixed number of reads that depends on the size of the estimate; in this instance each simulated sample is 14.7 million reads. Simulation will take about half an hour.

snakemake --snakefile snakefiles/simul_data.snk --configfile snakefiles/config-simul.yml -j <n cores>

To combine the reads into test and train sets, run the shuffle-reads.sh script in the directory where the reads were generated. This will take a few hours to complete. Make sure you have enough disk space, too, because this step creates several uncompressed copies of the FASTA files.

cd ../output/simul_reads/simulated_reads/SRR1265600
sh {repo directory}/scripts/shuffle-reads.sh

Quantify and validate

Finally, edit snakefiles/config-readdepth.yml and specify the samples we simulated.

read_file_fmt: fasta
reads-dir: /fs/cbcb-lab/rob/students/skylar/perplexity/output/simul_reads/simulated_reads/SRR1265600
sample-names:
  - 10_million_reads
  - 20_million_reads
  - 30_million_reads
  - 40_million_reads
  # include more reads for later analysis
  - 50_million_reads
  - 60_million_reads
  - 70_million_reads
  - 80_million_reads

Then run the Snakefile for read depth, which performs quantification and holdout validation of each sample's quants against the test reads. Make sure to use a small -j. This can take several hours, maybe even a day or two, so feel free to work on something else and come back.

snakemake --snakefile snakefiles/readdepth.snk --configfile snakefiles/config-readdepth.yml -j <n cores>

Plot perplexity against log10 VBEM prior for samples of varying read depth

Welcome back. Let's plot the perplexities against the log10 VBEM prior again. Since we did holdout validation rather than k-fold cross-validation, there is only one line to plot per sample.

Observe that the smallest perplexity decreases with increasing read depth. However, the optimal VBEM prior doesn't appear to be correlated with read depth. It's possible that the optimal prior is independent of read depth, but we'd need more experiments to support this.

Conclusion

In this tutorial we demonstrated VBEM prior selection with RNA perplexity using k-fold cross validation and holdout validation. We also walked through some key highlights of the process:

  1. Perplexity is a proxy for RNA transcript abundance accuracy and can be used to select the best VBEM prior for a sample.

  2. Different samples can have different perplexities and priors.

  3. Increasing read depth lowers perplexity, and it is not yet clear whether it affects the best prior.

While perplexity is not a one-size-fits-all (samples) approach, it can help us produce more accurate RNA transcript abundance estimates. This will improve the results of downstream analyses and the biological insights that follow.

Footnotes

1. Currently the kfold script does not work for some read sets. A Rust rewrite for correctness and improved performance is planned.

2. If you run Snakemake with all cores while other users want to run jobs at the same time, they will get annoyed with you.

3. Salmon collapses reads into equivalence classes during quantification to reduce time and memory usage. Perplexity also uses equivalence classes because computing the approximate likelihood over each equivalence class is faster than computing the real likelihood over each possible alignment for all fragments.