plot.select.expr.2.EA
import sys
import os
import re
import numpy as np
import scipy
import scipy.stats
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.utils.extmath import randomized_svd
from sklearn.manifold import TSNE
from scipy.stats import pearsonr
sys.setrecursionlimit(10000)
from sklearn import svm
from sklearn.calibration import CalibratedClassifierCV
from scipy.stats import ttest_ind, ttest_rel
from operator import itemgetter
import pick_genes_by_total
def read_column(n):
f = open(n)
f.readline()
gx = {}
for l in f:
l = l.rstrip("\n")
ll = l.split("\t")
for ix,v in enumerate(ll):
if v=="": continue
gx.setdefault(ix, [])
gx[ix].append(v)
f.close()
return gx
def read_gsm(n):
f = open("%s" % n)
by_query = {}
q = None
queries = []
for l in f:
l = l.rstrip("\n")
if l.startswith("Query"):
q = l.split(" ", 1)[1]
queries.append(q)
by_query.setdefault(q, [])
continue
ll = l.split()
g = ll[0]
pval1 = float(ll[1])
pval2 = float(ll[2])
avgexpr_h = float(ll[3])
avgexpr_l = float(ll[4])
a_mean = float(ll[5])
a_std = float(ll[6])
#if pval1<1e-5 and pval2<1e-5: #EA
if pval1<0.05 and pval2<0.05: #AA
#if pval1<0.1 and pval2<0.1:
#by_query[q].append((g, pval1, pval2, avgexpr_h))
#by_query[q].append((g, pval1, pval2, avgexpr_h/avgexpr_l))
#by_query[q].append((g, pval1, pval2, np.log(avgexpr_h/avgexpr_l))) #EA
by_query[q].append((g, pval1, pval2, np.log(avgexpr_h/a_mean))) #mean, test #AA
#by_query[q].append((g, pval1, pval2, (avgexpr_h - a_mean)/a_std))
#by_query[q].append((g, pval1, pval2, avgexpr_h - a_mean)) #AA
#by_query[q].append((g, pval1, pval2, avgexpr_h / a_std))
f.close()
return by_query, queries
def read_list(n):
m = []
f = open(n)
for l in f:
l = l.rstrip("\n")
m.append(l)
f.close()
return m
def is_not_filtered(g):
if g.startswith("RPL") or g.startswith("RPS") or g.startswith("HLA-"):
return False
else:
return True
def is_not_gene(g, choice, qid):
if choice=="EA":
if qid==2:
if g=="PTPRC" or g=="CORO1A" or g=="CD52":
return False
return True
return True
elif choice=="AA":
if qid==1:
if g=="GNG11" or g=="RGS5" or g=="TPM1":
return False
return True
return True
if __name__=="__main__":
choice = sys.argv[2] #EA or AA
_, _, _, gene_sum = pick_genes_by_total.read("cell.type.expr.M%s.%s.more.joined.txt" % (sys.argv[1], choice), choice)
#g,g_CAF,g_T,g_Myeloid,g_Neutrophil, g_T+g_Neutrophil
if choice=="EA":
#gene_sum.sort(key=itemgetter(1), reverse=True)
#genes_AA = set([g[0] for g in gene_sum[:15]])
gene_sum.sort(key=itemgetter(5), reverse=True)
#gene_sum = [g for g in gene_sum if g[0] not in genes_AA]
else:
gene_sum.sort(key=itemgetter(5), reverse=True)
genes_EA = set([g[0] for g in gene_sum[:50]])
gene_sum.sort(key=itemgetter(1), reverse=True)
gene_sum = [g for g in gene_sum if g[0] not in genes_EA]
qid = int(sys.argv[1])
'''
if choice=="EA":
gene_sum.sort(key=itemgetter(5), reverse=True)
else:
gene_sum.sort(key=itemgetter(1), reverse=True)
'''
#g_of_interest = [g[0] for g in gene_sum[:100]]
g_of_interest = [g[0] for g in gene_sum if is_not_filtered(g[0]) and is_not_gene(g[0], choice, qid)]
g_of_interest = g_of_interest[:20] #AA, :40 works.
#g_of_interest = g_of_interest[:20]
#columns = read_column("EAcoef.filtered.list")
EA = read_list("EA.list")
#AA = read_list("AA.list.2")
AA = read_list("AA.list")
EA_list = {}
AA_list = {}
queries = []
for sample in EA:
EA_list[sample], queries = read_gsm(sample)
for sample in AA:
AA_list[sample], queries = read_gsm(sample)
#print(queries)
#sys.exit(0)
#genes_x = read_list(sys.argv[1])
#genes_x = read_list("selected.by.expr.q%s.EA.more.txt" % sys.argv[1])
#genes_x = read_list("selected.by.expr.q%s.EA.more.with.neutro.txt" % sys.argv[1])
#genes_x = columns[qid-1]
genes_x = g_of_interest
EA_higher = 0
AA_higher = 0
equal = 0
by_gene = {}
t_table = {}
t_id = 0
ax_EA = []
ax_AA = []
for ix,q in enumerate(queries):
#print("Query:", q)
by_gene_EA = {}
for sample in EA:
for g, i, j, a in EA_list[sample][q]:
by_gene_EA.setdefault(g, {})
#by_gene_EA[g].setdefault(sample, 0)
by_gene_EA[g][sample] = a
by_gene_AA = {}
for sample in AA:
for g, i, j, a in AA_list[sample][q]:
by_gene_AA.setdefault(g, {})
#by_gene_AA[g].setdefault(sample, 0)
by_gene_AA[g][sample] = a
if ix == qid-1:
for ggid, g in enumerate(genes_x):
sam_EA = []
sam_AA = []
for s in EA:
if g not in by_gene_EA or s not in by_gene_EA[g]:
sam_EA.append(0)
#sam_EA.append(1.0)
else:
sam_EA.append(by_gene_EA[g][s])
for s in AA:
if g not in by_gene_AA or s not in by_gene_AA[g]:
sam_AA.append(0)
#sam_AA.append(1.0)
else:
sam_AA.append(by_gene_AA[g][s])
'''
for sE, sA in zip(sam_EA, sam_AA):
t_table[t_id] = [g, sA, sE]
t_id+=1
'''
#sam_EA = [by_gene_EA[g][s] for s in EA]
#sam_AA = [by_gene_AA[g][s] for s in AA]
#sam_AA[:4]
avg_AA = np.mean(sam_AA)
avg_EA = np.mean(sam_EA)
sys.stdout.write(g + "\t" + str(avg_AA) + "\t" + str(avg_EA) + "\t" + str(avg_AA-avg_EA) + "\n")
#sys.stdout.write(g + "\t" + "\t".join(["%.2f" % x for x in sam_AA]) + "\t" + "\t".join(["%.2f" % x for x in sam_EA]) + "\n")
if avg_EA > avg_AA:
EA_higher+=1
elif avg_AA > avg_EA:
AA_higher+=1
else:
equal+=1
#t_table[ggid] = [g, np.exp(avg_AA), np.exp(avg_EA)]
t_table[ggid] = [g, avg_AA, avg_EA]
exp_AA = np.exp(avg_AA)
exp_EA = np.exp(avg_EA)
#ax_AA.append(exp_AA)
#ax_EA.append(exp_EA)
ax_AA.append(avg_AA)
ax_EA.append(avg_EA)
#ax_diff.append(exp_EA - exp_AA)
print(AA_higher, EA_higher, equal)
print(ttest_rel(ax_AA, ax_EA, alternative="less"))
df = pd.DataFrame.from_dict(t_table, orient="index", columns=["gene", "AA", "EA"])
dx = pd.melt(df, id_vars=["gene"], value_vars=["AA", "EA"])
plt.figure(figsize=(5,5))
plt.subplots_adjust(left=0.185, right=0.66, bottom=0.11, top=0.955, hspace=0.2, wspace=0.2)
sns.lineplot(data=dx, x="variable", y="value", hue="gene", ci=None, dashes=False, palette="gnuplot", markers=True, style="gene") #ci = None
plt.legend(bbox_to_anchor=(1.05,1), loc=2)
#plt.show()
#plt.savefig("lineplot_%s_q%d.png" % (choice, qid))
plt.savefig("lineplot_expr_z_%s_q%d.png" % (choice, qid))
#sys.exit(0)
'''
genes = set(list(by_gene_EA.keys()) + list(by_gene_AA.keys()))
gs = []
for g in genes:
#print(q, g, by_gene_EA.get(g, 0), by_gene_AA.get(g, 0))
count_EA = by_gene_EA.get(g, 0)
count_AA = by_gene_AA.get(g, 0)
size_ratio = len(AA) / len(EA)
if count_EA==0:
ratio = count_AA / len(AA) / (1/(len(EA)*2))
score = ratio * (1.0 * size_ratio +count_AA) / 2
else:
ratio = count_AA/ len(AA) / (count_EA / len(EA))
score = ratio * (count_EA * size_ratio +count_AA) / 2
if ratio>1.2 and count_AA/len(AA) >= 0.5:
#if ratio<1.2 and ratio>1 and count_AA/len(AA) >= 0.5:
gs.append((g, ratio, score, count_EA, count_AA))
gs.sort(key=itemgetter(2), reverse=True)
for g,c1,c2,c3,c4 in gs:
if g in genes_x:
#print(g, c1, c2, c3, c4)
by_gene.setdefault(g, {})
by_gene[g][ix] = c1
'''
'''
fw = open("q%d.shared2.more.txt" % (ix+1), "w")
for g,c1,c2,c3,c4 in gs:
fw.write(g + "\n")
fw.close()
'''
for g in genes_x:
if g not in by_gene: continue
sys.stdout.write(g + "\t")
for ix in range(9):
sys.stdout.write(str(by_gene[g].setdefault(ix, 0)) + "\t")
sys.stdout.write("\n")
Running Command
python3 plot.select.2.EA.py AA 1
python3 plot.select.2.EA.py EA 2
Results