Basic Jupyter Notebook to plot a simple VCF file.¶

original can be found here: https://colab.research.google.com/drive/1moIyNo64fVK1TNcZQTMMyE20OqHqDf-3?usp=sharing

In [12]:
#@title imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
! pip install cyvcf2 # not standard, need to install it first
import cyvcf2
import matplotlib.gridspec as gridspec   # more plotting utilities
from matplotlib.colors import LinearSegmentedColormap # yet another plotting utility
import numpy as np
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: cyvcf2 in /usr/local/lib/python3.9/dist-packages (0.30.18)
Requirement already satisfied: coloredlogs in /usr/local/lib/python3.9/dist-packages (from cyvcf2) (15.0.1)
Requirement already satisfied: click in /usr/local/lib/python3.9/dist-packages (from cyvcf2) (8.1.3)
Requirement already satisfied: numpy in /usr/local/lib/python3.9/dist-packages (from cyvcf2) (1.22.4)
Requirement already satisfied: humanfriendly>=9.1 in /usr/local/lib/python3.9/dist-packages (from coloredlogs->cyvcf2) (10.0)
In [14]:
#@title mount drive
#this is for loading the VCF file that  i have in my Gdrive. but you can also drag and drop it into the filesystem on the left
from google.colab import drive
try:
  drive.mount('/content/drive')
  root_path = "/content/drive/MyDrive/proj_2_colabs"
except:
  print("Google Colab not detected.")
Mounted at /content/drive
In [37]:
#@title function to parse the VCF into a genotype matrix ########################################
def load_vcf(input_vcf, threads=1, aaf_thresh=0.0):          # the function "load_vcf" has arguments, namely the name of the input_vcf,
                                                             # how many threads to use ( computing)
                                                             #  and a threshold for alternative allele frequency the default threshold is 0.0
    """ load a vcf """
    vcf = cyvcf2.VCF(input_vcf,threads=threads) # load the vcf
    gts = []                                                 #init empty list for genotype_entries
    aaf = []                                                 #init empty list for alternative allele frequencies
    chr_pos = []                                             #init empty list for positions

    for variant in vcf:                                      # for each variant/position, do:
        if variant.aaf > aaf_thresh:                         # if the alt. allele freq. is above threshold:
            gts.append(variant.gt_types.astype(int))         # append genotype array to gts
            chr_pos.append(variant.POS)                      # append position to position-list
            aaf.append(variant.aaf)                          # append aaf to alt. allele. freq. list

    gt_array = np.array(gts)                                 # make list of per-position arrays into SAMPLE x POS rectangular matrix
    samples = vcf.samples                                    # extract list of sample names from vcf

    return aaf, [chr_pos, samples, gt_array]                 # return aaf, and genotype matrix with column and row names
In [38]:
#@title function to make a rolling mean for the allele frequencies ##############################
def make_sliding_aaf_mean(aaf,winsize=10):
    """takes array and windowsize, default 10 SNPs """
    rmean = np.convolve(aaf, np.ones((winsize,))/winsize, mode='valid')  # https://en.wikipedia.org/wiki/Convolution
    return rmean
In [63]:
#@title run
aaf, gt_array = load_vcf('/content/drive/MyDrive/Mt_h37rv_2021_2.vcf')        # load the vcf, extract gt_matrix and alt. allele freq.
rmean = make_sliding_aaf_mean(aaf)                                            # calculate alt. allele. freq sliding mean
In [73]:
figure = plt.figure(constrained_layout=True, figsize=(20,20)) # create the figure with a size of 20x20
gs = figure.add_gridspec(2, 1, height_ratios=[10,1])          # make a grid of 2x1 subplots,
                                                              # i.e. two plots on top of each other
                                                              # with a height ratio of 10:1
figure_ax1 = figure.add_subplot(gs[0])                        # ax1 is the top plot
figure_ax2 = figure.add_subplot(gs[1])                        # ax2 is the smaller bottom plot

cmap =  ['white', 'black'] # make a simple white/black colormap
A = pd.DataFrame(gt_array[2]).T.replace(3, np.nan)

A.index = list(gt_array[1]) 
A.columns= list(gt_array[0])                           # plot the heatmap on the big-plot
#row_labels=gt_array[1],                               # add labels
#col_labels=gt_array[0],                               # add labels
                                       # define the colormap, here we only want Black/White
sns.heatmap(A, cbar=False, ax=figure_ax1, cmap=cmap)                                        # plot on ax1
figure_ax1.set_ylabel("Samples", size=20)
figure_ax1.set_title("Variants", size=30)
figure_ax2.plot(rmean, color="Black")                         # plot the rolling mean on ax2
figure_ax2.set_xlim(0,146817)                                   # set limits for the x-axis
figure_ax2.set_ylim(0,0.5)                                    # set limits for the y-axis

figure_ax2.get_xaxis().set_ticks([])                          # hide x-tick-labels

figure_ax1.set_xticks([0,146817])                                # set one tick at the beginning, one at the end

figure_ax1.set_xticklabels(["\"Beginning\"", "\"End\""], rotation=0)        # ignoring col_labels, merely plotting "beginning" and "end"

figure_ax2.set_ylabel("AAF", size=20)
plt.show()                                                    # show the plot