step3.do.pearson.2.bygene.custom
import math
import sys
import re
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy
import scipy.stats
from scipy.stats import zscore
import phenograph
from operator import itemgetter
import pandas as pd
import seaborn as sns
import pick_genes_by_total
def get_unique_clusters(n):
m = []
f = open(n)
for l in f:
l = l.rstrip("\n")
m.append(l)
f.close()
return m
#def read(n, order_n, gene_n):
def read(n, order_n, gene_file=None, genes_interest=None):
if gene_file is not None:
genes_interest = []
f = open(gene_file)
for l in f:
l = l.rstrip("\n")
genes_interest.append(l)
f.close()
f = open(n)
num_gene = 0
h = f.readline().rstrip("\n").split("\t")
for l in f:
l = l.rstrip("\n")
num_gene+=1
f.close()
f = open(n)
num_col = len(h) - 1
mat = np.zeros((num_gene, num_col), dtype="float32")
f.readline()
genes = []
gene_id = 0
for l in f:
l = l.rstrip("\n")
ll = l.split("\t")
t_gene = ll[0]
genes.append(t_gene)
mat[gene_id, :] = [float(v) for v in ll[1:]]
gene_id+=1
f.close()
header = h[1:]
header = np.array(header)
m_h = {}
for t_id, val in enumerate(header):
m_h[val] = t_id
f = open(order_n)
cond = []
new_order = []
for l in f:
l = l.rstrip("\n")
cond.append(l)
new_order.append(m_h[l])
f.close()
new_order = np.array(new_order)
mat = mat[:, new_order]
gmap = {}
for ix,v in enumerate(genes):
gmap[v] = ix
good_ids = np.array([gmap[g] for g in genes_interest])
mat = mat[good_ids, :]
genes = np.array(genes)
genes = genes[good_ids]
return mat, genes, header[new_order]
def is_not_filtered(g):
if g.startswith("RPL") or g.startswith("RPS") or g.startswith("HLA-"):
return False
else:
return True
if __name__=="__main__":
choice = sys.argv[2] #AA or EA
_, _, _, gene_sum = pick_genes_by_total.read("cell.type.expr.%s.%s.more.joined.txt" % (sys.argv[1], choice), choice)
#g,g_CAF,g_T,g_Myeloid,g_Neutrophil, g_T+g_Neutrophil
if choice=="EA":
gene_sum.sort(key=itemgetter(1), reverse=True)
genes_AA = set([g[0] for g in gene_sum[:35]])
gene_sum.sort(key=itemgetter(5), reverse=True) #important
gene_sum = [g for g in gene_sum if g[0] not in genes_AA]
else:
gene_sum.sort(key=itemgetter(5), reverse=True)
genes_EA = set([g[0] for g in gene_sum[:35]])
gene_sum.sort(key=itemgetter(1), reverse=True) #important
gene_sum = [g for g in gene_sum if g[0] not in genes_EA]
g_of_interest = [g[0] for g in gene_sum if is_not_filtered(g[0])]
g_of_interest = g_of_interest[:50]
#g_of_interest = [g[0] for g in gene_sum]
#g_of_interest = ["C1QB", "COL1A2", "DCN", "COL4A2", "THY1", "IGFBP7", "FCER1G", "IGKC", "FN1", "VCAN", "PCOLCE", "FSTL1", "ITM2B", "CTSB", "S100A4"]
#g_of_interest = ["S100A9", "PLSCR1", "CD6", "CAPG", "EFHD2", "RAC2", "CXCL9", "BIRC3", "CTSC", "GZMK", "CCL5", "IL2RG", "LY6E", "CD48", "CD2", "MX1", "IFITM1"]
mat, genes, conditions = read("cell.type.expr.%s.%s.more.joined.txt" % (sys.argv[1], choice), "order.%s.query.2.txt" % choice, genes_interest=g_of_interest, gene_file=None) #good
sys.stdout.write("\t".join(conditions) + "\n")
for ig,g in enumerate(genes):
sys.stdout.write(g + "\t" + "\t".join(["%.3f" % v for v in mat[ig,:]]) + "\n")
#plt.tick_params(axis='both', which='major', labelsize=10, labelbottom = False, bottom=False, top = False, labeltop=True)
nj_union = genes
nj_union_title = []
for n in nj_union:
#n_new = n.split(" ")[0]
n_new = n
nj_union_title.append(n_new)
nt = {}
for ind,ki in enumerate(conditions):
nx = []
for ind2, g in enumerate(genes):
#es = sum_values[protein_map[g], ind]
nx.append((ki, g, mat[ind2, ind], mat[ind2, ind]))
#ki_new = ki.split(" ")[0]
ki_new = ki
nt[ki_new] = pd.Series([tx[2] for tx in nx], index=nj_union_title)
cg=sns.clustermap(pd.DataFrame(nt), row_cluster=True, col_cluster=False, \
#7,10
figsize=(7, 10), method="ward", \
#vmax=3, vmin=0, #AA \
#vmax=2, vmin=0, #AA \
vmax=0.7, vmin=0, #EA, good \
#vmax=1.0, vmin=0, #EA \
#col_ratios={"dendrogram":0.05}, \
#row_ratios={"dendrogram":0.05}, \
dendrogram_ratio=(0.1, 0.01), \
cbar_pos=None,\
#cmap="rainbow", \
cmap="Spectral_r", \
#cmap="Reds", \
yticklabels=True, xticklabels=True)
#plt.show()
#sys.exit(0)
#cg.fig.savefig("heatmap_%s.png" % (t_key))
#cg.ax_heatmap.tick_params(labeltop=True, labelbottom=False, labelleft=True, labelright=False, \
#top=True, bottom=False, left=True, right=False)
plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
#save_file = "heatmap_%s.png" % (sys.argv[1])
#save_file = "heatmap_AAcoef_%s.png" % sys.argv[1]
save_file = "heatmap_%scoef_%s.png" % (choice, sys.argv[1])
#save_file = "heatmap_AAcoef.png"
#save_file = "heatmap_EAcoef_rearr.png"
#save_file = "heatmap_AAcoef_rearr.png"
cg.fig.savefig("%s" % save_file)
sys.exit(0)
'''
protein_map = {}
for ind,v in enumerate(proteins):
protein_map[v] = ind
by_category = read_case_map("pngs/sheet")
print(by_category)
'''
#print(by_case)
#sys.exit(0)
#group1 = ["Case10_ROI002", "Case10_ROI003", "Case10_ROI004"]
#group1 = ["Case10_ROI005", "Case10_ROI006", "Case10_ROI007"]
#group1 = ["Case10_ROI001", "Case10_ROI008"]
#group1 = ["Case11_ROI001", "Case11_ROI007"]
#group1 = ["Case11_ROI003", "Case11_ROI004", "Case11_ROI005", "Case11_ROI006"]
#group1 = ["Case11_ROI002"]
#group1 = ["Case1_ROI001", "Case1_ROI003", "Case1_ROI004"]
#group1 = ["Case1_ROI002", "Case1_ROI005", "Case1_ROI006"]
for t_category in by_category:
print(t_category)
group1 = by_category[t_category]
sum_values = np.zeros((len(proteins), len(proteins)), dtype="float32")
for t_key in group1:
t_case = by_case[t_key]
values = np.zeros((len(proteins), len(proteins)), dtype="float32")
for i in range(len(proteins)):
for j in range(len(proteins)):
values[i,j] = scipy.stats.pearsonr(t_case[:,i], t_case[:,j])[0]
sum_values[i,j] += 1/len(group1) * scipy.stats.pearsonr(t_case[:,i],t_case[:,j])[0]
nj_union = proteins
nj_union_title = []
for n in nj_union:
n_new = n.split("_")[2]
nj_union_title.append(n_new)
nt = {}
for ind,ki in enumerate(proteins):
nx = []
for g in nj_union:
es = sum_values[protein_map[g], ind]
nx.append((ki, g, np.mean(es), np.std(es)))
#cluster_expr[:, ki-1] = np.array([n[2] for n in nx])
ki_new = ki.split("_")[2]
nt[ki_new] = pd.Series([tx[2] for tx in nx], index=nj_union_title)
cg=sns.clustermap(pd.DataFrame(nt), row_cluster=False, col_cluster=False, \
figsize=(5, 5), method="average", \
vmax=1.0, vmin=-1.0, \
#col_ratios={"dendrogram":0.05}, \
#row_ratios={"dendrogram":0.05}, \
dendrogram_ratio=(0.1, 0.01), \
#cbar_pos=None,\
cmap="Spectral", yticklabels=True, xticklabels=True)
plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
#plt.show()
#cg.fig.savefig("heatmap_%s.png" % (t_key))
save_file = "heatmap_%s_%s.png" % (t_category[0], t_category[1])
cg.fig.savefig("%s" % save_file)
Running Command
python3 step3.do.pearson.2.bygene.custom.py M1 AA
Results