original can be found here: https://colab.research.google.com/drive/1moIyNo64fVK1TNcZQTMMyE20OqHqDf-3?usp=sharing
#@title imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
! pip install cyvcf2 # not standard, need to install it first
import cyvcf2
import matplotlib.gridspec as gridspec # more plotting utilities
from matplotlib.colors import LinearSegmentedColormap # yet another plotting utility
import numpy as np
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: cyvcf2 in /usr/local/lib/python3.9/dist-packages (0.30.18) Requirement already satisfied: coloredlogs in /usr/local/lib/python3.9/dist-packages (from cyvcf2) (15.0.1) Requirement already satisfied: click in /usr/local/lib/python3.9/dist-packages (from cyvcf2) (8.1.3) Requirement already satisfied: numpy in /usr/local/lib/python3.9/dist-packages (from cyvcf2) (1.22.4) Requirement already satisfied: humanfriendly>=9.1 in /usr/local/lib/python3.9/dist-packages (from coloredlogs->cyvcf2) (10.0)
#@title mount drive
#this is for loading the VCF file that i have in my Gdrive. but you can also drag and drop it into the filesystem on the left
from google.colab import drive
try:
drive.mount('/content/drive')
root_path = "/content/drive/MyDrive/proj_2_colabs"
except:
print("Google Colab not detected.")
Mounted at /content/drive
#@title function to parse the VCF into a genotype matrix ########################################
def load_vcf(input_vcf, threads=1, aaf_thresh=0.0): # the function "load_vcf" has arguments, namely the name of the input_vcf,
# how many threads to use ( computing)
# and a threshold for alternative allele frequency the default threshold is 0.0
""" load a vcf """
vcf = cyvcf2.VCF(input_vcf,threads=threads) # load the vcf
gts = [] #init empty list for genotype_entries
aaf = [] #init empty list for alternative allele frequencies
chr_pos = [] #init empty list for positions
for variant in vcf: # for each variant/position, do:
if variant.aaf > aaf_thresh: # if the alt. allele freq. is above threshold:
gts.append(variant.gt_types.astype(int)) # append genotype array to gts
chr_pos.append(variant.POS) # append position to position-list
aaf.append(variant.aaf) # append aaf to alt. allele. freq. list
gt_array = np.array(gts) # make list of per-position arrays into SAMPLE x POS rectangular matrix
samples = vcf.samples # extract list of sample names from vcf
return aaf, [chr_pos, samples, gt_array] # return aaf, and genotype matrix with column and row names
#@title function to make a rolling mean for the allele frequencies ##############################
def make_sliding_aaf_mean(aaf,winsize=10):
"""takes array and windowsize, default 10 SNPs """
rmean = np.convolve(aaf, np.ones((winsize,))/winsize, mode='valid') # https://en.wikipedia.org/wiki/Convolution
return rmean
#@title run
aaf, gt_array = load_vcf('/content/drive/MyDrive/Mt_h37rv_2021_2.vcf') # load the vcf, extract gt_matrix and alt. allele freq.
rmean = make_sliding_aaf_mean(aaf) # calculate alt. allele. freq sliding mean
figure = plt.figure(constrained_layout=True, figsize=(20,20)) # create the figure with a size of 20x20
gs = figure.add_gridspec(2, 1, height_ratios=[10,1]) # make a grid of 2x1 subplots,
# i.e. two plots on top of each other
# with a height ratio of 10:1
figure_ax1 = figure.add_subplot(gs[0]) # ax1 is the top plot
figure_ax2 = figure.add_subplot(gs[1]) # ax2 is the smaller bottom plot
cmap = ['white', 'black'] # make a simple white/black colormap
A = pd.DataFrame(gt_array[2]).T.replace(3, np.nan)
A.index = list(gt_array[1])
A.columns= list(gt_array[0]) # plot the heatmap on the big-plot
#row_labels=gt_array[1], # add labels
#col_labels=gt_array[0], # add labels
# define the colormap, here we only want Black/White
sns.heatmap(A, cbar=False, ax=figure_ax1, cmap=cmap) # plot on ax1
figure_ax1.set_ylabel("Samples", size=20)
figure_ax1.set_title("Variants", size=30)
figure_ax2.plot(rmean, color="Black") # plot the rolling mean on ax2
figure_ax2.set_xlim(0,146817) # set limits for the x-axis
figure_ax2.set_ylim(0,0.5) # set limits for the y-axis
figure_ax2.get_xaxis().set_ticks([]) # hide x-tick-labels
figure_ax1.set_xticks([0,146817]) # set one tick at the beginning, one at the end
figure_ax1.set_xticklabels(["\"Beginning\"", "\"End\""], rotation=0) # ignoring col_labels, merely plotting "beginning" and "end"
figure_ax2.set_ylabel("AAF", size=20)
plt.show() # show the plot