select.expr.Tcell.exh.module.bubble.dec27
import sys
import os
import re
import numpy as np
from operator import itemgetter
from scipy.stats import ttest_ind, ks_2samp, mannwhitneyu, kruskal
import matplotlib.pyplot as plt
def read_gsm(n):
f = open(n)
by_query = {}
q = None
queries = []
for l in f:
l = l.rstrip("\n")
if l.startswith("Query"):
q = l.split(" ", 1)[1]
queries.append(q)
by_query.setdefault(q, [])
continue
ll = l.split()
g = ll[0]
pval1 = float(ll[1])
pval2 = float(ll[2])
avgexpr_h = float(ll[3])
avgexpr_l = float(ll[4])
a_mean = float(ll[5])
by_query[q].append((g, pval1, pval2, avgexpr_h - a_mean))
f.close()
return by_query, queries
def read_list(n):
m = []
f = open(n)
for l in f:
l = l.rstrip("\n")
m.append(l)
f.close()
return m
if __name__=="__main__":
EA = read_list("../curated.ea.list")
AA = read_list("../curated.aa.list")
EA_list = {}
AA_list = {}
queries = []
for sample in EA:
EA_list[sample], queries = read_gsm(sample)
for sample in AA:
AA_list[sample], queries = read_gsm(sample)
genes_x = ["CD8A", "CD3D", "TRBC2", "LAG3", "HAVCR2", "CXCL13", "GZMB", "BATF", "CCL3", "CSF1", "TIGIT"]
print(queries)
by_gene = {}
percent_expressed = np.empty((len(genes_x), 2), dtype="float32")
scaled_expression = np.empty((len(genes_x), 2), dtype="float32")
for qind, qid in enumerate([2]):
for ix,q in enumerate(queries):
by_gene_EA = {}
for sample in EA:
for g, i, j, a in EA_list[sample][q]:
by_gene_EA.setdefault(g, {})
by_gene_EA[g].setdefault(sample, 0)
by_gene_EA[g][sample] = a
by_gene_AA = {}
for sample in AA:
for g, i, j, a in AA_list[sample][q]:
by_gene_AA.setdefault(g, {})
by_gene_AA[g].setdefault(sample, 0)
by_gene_AA[g][sample] = a
if ix == qid-1:
for gi,g in enumerate(genes_x):
sam_EA = []
sam_AA = []
nonzero_EA = 0
nonzero_AA = 0
for s in EA:
if g not in by_gene_EA or s not in by_gene_EA[g]:
sam_EA.append(0)
else:
sam_EA.append(by_gene_EA[g][s])
nonzero_EA+=1
for s in AA:
if g not in by_gene_AA or s not in by_gene_AA[g]:
sam_AA.append(0)
else:
sam_AA.append(by_gene_AA[g][s])
nonzero_AA+=1
percent_expressed[gi, qind*2] = nonzero_AA/len(sam_AA)
percent_expressed[gi, qind*2+1] = nonzero_EA/len(sam_EA)
total_expr = np.sum(sam_EA) + np.sum(sam_AA)
total_len = len(sam_EA) + len(sam_AA)
scaled_expression[gi, qind*2] = np.mean(np.array(sam_AA) / total_expr * total_len)
scaled_expression[gi, qind*2+1] = np.mean(np.array(sam_EA) / total_expr * total_len)
print(np.mean(np.array(sam_AA) / (total_expr/(len(sam_EA) + len(sam_AA)))))
print(np.mean(np.array(sam_EA) / (total_expr/(len(sam_EA) + len(sam_AA)))))
print(g + "\t" + "mean" + "\t" + str(np.mean(sam_AA)) + "\t" + str(np.mean(sam_EA)) + "\n")
sys.stdout.write(g + "\t" + "\t".join(["%.5f" % x for x in sam_AA]) + "\t|" + "\t".join(["%.5f" % x for x in sam_EA]) + "\n")
sys.stdout.write(g + "\t" + str(np.std(sam_AA)) + "\t" + str(np.std(sam_EA)) + "\n")
# Create a figure and a grid of subplots
fig, ax = plt.subplots()
# Create the bubble plot with circular markers
for i in range(percent_expressed.shape[0]):
for j in range(percent_expressed.shape[1]):
size = percent_expressed[i,j] * 100
#color = 'red' if fold_changes[i, j] > 0 else 'blue'
print(genes_x[i], j, size, scaled_expression[i,j])
ax.scatter(j, i, s=size, c=scaled_expression[i,j], cmap="rainbow", vmin=0, vmax=1.74, marker='o') # Ensure marker is 'o' for circles, vmax=0.06
# Setting the plot limits
ax.set_xlim(-0.5, percent_expressed.shape[1]-0.5)
ax.set_ylim(-0.5, percent_expressed.shape[0]-0.5)
# Invert the y-axis to have the first row at the top in the plot
ax.invert_yaxis()
# Show plot
plt.show()
Running Command
python3 select.expr.Tcell.exh.module.bubble.dec27.py
Results