Sequence Coverage Script
Below is a python script for generating a sequence coverage plot. It uses the prediction results generated by AlphaFold (features.pkl
) to visualize the multiple sequence alignment (MSA) coverage.
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--input_dir',dest='input_dir',required=True)
parser.add_argument('--name',dest='name')
parser.set_defaults(name='')
parser.add_argument('--output_dir',dest='output_dir')
parser.set_defaults(output_dir='')
args = parser.parse_args()
def generate_seq_cov_plot(name, in_dir, out_dir):
import pickle
import numpy as np
import matplotlib.pyplot as plt
feature_dict = pickle.load(open(f'{in_dir}/features.pkl','rb'))
msa = feature_dict['msa']
seqid = (np.array(msa[0] == msa).mean(-1))
seqid_sort = seqid.argsort()
non_gaps = (msa != 21).astype(float)
non_gaps[non_gaps == 0] = np.nan
final = non_gaps[seqid_sort] * seqid[seqid_sort, None]
###################### PLOT MSA WITH COVERAGE ####################
plt.figure(figsize=(8, 4), dpi=100)
plt.title(f"Sequence coverage ({name})")
plt.imshow(final,
interpolation='nearest', aspect='auto',
cmap="rainbow_r", vmin=0, vmax=1, origin='lower')
plt.plot((msa != 21).sum(0), color='black')
plt.xlim(-0.5, msa.shape[1] - 0.5)
plt.ylim(-0.5, msa.shape[0] - 0.5)
plt.colorbar(label="Sequence identity to query", )
plt.xlabel("Positions")
plt.ylabel("Sequences")
plt.savefig(f"{out_dir}/{name}_msa_sequence_coverage.pdf")
generate_seq_cov_plot(args.name, args.input_dir, args.output_dir)
The script outputs a color-coded heatmap as a PDF.