diffexpr.module
import sys
import os
import re
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu
from scipy.stats import ttest_ind
from operator import itemgetter
def read_coord(n):
f = open(n)
Xcen = []
cells = []
for l in f:
l = l.rstrip("\n")
ll = l.split(",")
cells.append(ll[0])
Xcen.append((float(ll[-2]), float(ll[-1])))
f.close()
Xcen2 = np.empty((len(Xcen), 2), dtype="float32")
for ind,(i,j) in enumerate(Xcen):
Xcen2[ind, :] = [j,-1.0*i]
Xcen = Xcen2
return Xcen, cells
def read_expression(n):
f = open(n)
h = f.readline().rstrip("\n").split()
h = [xh.replace(".", "-") for xh in h]
num_gene = 0
for l in f:
l = l.rstrip("\n")
num_gene+=1
f.close()
mat = np.empty((num_gene, len(h)), dtype="float32")
f = open(n)
f.readline()
ig = 0
genes = []
for l in f:
l = l.rstrip("\n")
ll = l.split()
gene = ll[0]
values = [float(v) for v in ll[1:]]
mat[ig,:] = values
genes.append(gene)
ig+=1
f.close()
return mat, h, genes
if __name__=="__main__":
sample = sys.argv[1]
mat, cells, genes = read_expression("../dir_%s/Giotto_norm_expr.txt" % sample)
Xcen, Xcells = read_coord("../dir_%s/spatial/tissue_positions_list.csv" % sample)
map_cell = {}
for ic,c in enumerate(Xcells):
map_cell[c] = ic
good_cell_ind = np.array([map_cell[c] for c in cells])
Xcen = Xcen[good_cell_ind, :]
gene_lists = []
weights = []
gene_lists.append(("PTPRC", "VIM", "PECAM1", "AR", "FCGR3A", "FCGR3B", "CD68", "CD163"))
weights.append((1.0, 1.0, 1.0, 1.0, 0.17, 0.17, 0.33, 0.33))
gene_lists.append(("CTLA4", "CD8A", "CD4", "CD163", "CD68", "PTPRC", "HIF1A", "GZMB", "PLK1", "PDCD1", "CD274", "CD3E",\
"CD3D", "CD3G"))
weights.append((1.0, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.3, 0.3, 0.3))
gene_lists.append(("MKI67", "CDH1", "VEGFA", "ITGAX", "FOXP3", "KIFC1", "KRT7", "KRT17", "KRT18", "KRT76", "KRT77"))
weights.append((1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2))
gene_lists.append(("MPO", "HIF1A", "GZMB", "CD68", "CTLA4", "ARG1"))
weights.append((1.0, 1.0, 1.0, 1.0, 1.0, 1.0))
gene_lists.append(("FOXP3", "CD4", "PECAM1", "KIFC1", "CDH1"))
weights.append((1.0, 1.0, 1.0, 1.0, 1.0))
gene_lists.append(("CD68", "CD163", "VIM", "PTPRC", "MS4A1", "PLK1"))
weights.append((0.5, 0.5, 1.0, 1.0, 1.0, 1.0))
gene_lists.append(("PECAM1", "FCGR3A", "FCGR3B", "CD68", "FOXP3", "CD4", "VIM", "PTPRC"))
weights.append((1.0, 0.25, 0.25, 0.5, 1.0, 1.0, 1.0, 1.0))
gene_lists.append(("FCGR3A", "FCGR3B", "CD163", "CD14", "VIM", "AR", "PLK1"))
weights.append((0.25, 0.25, 0.5, 1.0, 1.0, 1.0, 1.0))
map_gene = {}
for ig,g in enumerate(genes):
map_gene[g] = ig
dot_size = 5
ncol = 6
nrow = int(9*2 / ncol)
size_factor = 3
if 9*2%ncol>0:
nrow+=1
f, axn = plt.subplots(nrow, ncol, figsize=(ncol * size_factor, nrow * size_factor))
#plt.subplots_adjust(hspace=0.01, wspace=0.01)
plt.subplots_adjust(hspace=0.1, wspace=0.1)
ct = 0
for ind,aG in enumerate(gene_lists):
gene_ids = []
w_ids = []
for inx,gx in enumerate(aG):
if not gx in map_gene:
continue
gene_ids.append(map_gene[gx])
w_ids.append(inx)
w_ids = np.array(w_ids)
gene_ids = np.array(gene_ids)
t_weight = np.array(weights[ind])[w_ids]
avg = np.average(mat[gene_ids,:], weights=t_weight, axis=0)
#avg = np.mean(mat[gene_ids, :], axis=0)
v1x = np.percentile(avg, 90)
m2 = np.where(avg>v1x)[0]
v1x = np.percentile(avg, 10)
m2x = np.where(avg<=v1x)[0]
print("Query:", " ".join(aG))
aList = []
for ig,g in enumerate(genes):
if sum(mat[ig,m2])==0 or sum(mat[ig,m2x])==0:
continue
t2 = ttest_ind(mat[ig,m2], mat[ig,m2x], equal_var=False)
t2_ks = mannwhitneyu(mat[ig,m2], mat[ig,m2x])
if t2[0]>0 and t2[1]<0.05 and t2_ks[1]<0.05:
#print(g, t2[1], t2_ks[1])
aList.append((ig, g, t2[1], t2_ks[1]))
aList.sort(key=itemgetter(2), reverse=False)
for ig,g,j,k in aList:
print(g, j, k, np.mean(mat[ig,m2]), np.mean(mat[ig,m2x]), np.mean(mat[ig,:]))
Running Command
python3 diffexpr.module.py
Results
Query: PTPRC VIM PECAM1 AR FCGR3A FCGR3B CD68 CD163
VIM 5.758939560746751e-102 5.60511765796084e-48 3.4079967 0.098993964 1.3349795
COL1A1 3.864652162745244e-83 8.392050875510012e-46 4.4381433 0.8394257 1.8453412
COL1A2 9.616054897600655e-80 2.730398831074907e-44 4.4184346 0.6233368 1.7685285
HLA-B 2.4678366027904186e-77 4.831162237969237e-43 3.3376515 1.0402361 1.7852688
COL3A1 1.137986496615667e-74 1.3306070075135856e-44 3.7037797 0.33060858 1.2484194
SPARC 4.706649318680122e-74 4.776126276408928e-44 3.4254394 0.42126575 1.2508768
HLA-A 4.190522120912639e-66 4.186431275929642e-40 3.0472841 1.1122996 1.729778
B2M 9.962084335998903e-64 7.832971011172612e-42 4.0291443 2.2660863 2.8000264
FN1 1.440785435299462e-63 9.018066661048086e-42 4.2459993 1.782621 2.754173
BGN 1.688966568540009e-63 2.948545580822197e-41 2.0247881 0.21093556 0.6704553
COL6A1 7.475147595865111e-61 1.4666639694194973e-42 2.2068079 0.2711836 0.7156327
COL6A2 1.1424802316442945e-60 2.2472873007840957e-40 2.2976284 0.24433999 0.72942185
TIMP2 3.7998777751878596e-60 6.423379281802069e-39 1.7416278 0.26670942 0.670993
NNMT 6.233349737193785e-59 4.926997454245028e-39 1.9940393 0.4969321 1.0961503
COL6A3 8.10816360435328e-59 1.3868201205530176e-43 1.7237471 0.11531009 0.47272012
LGALS1 2.339861291443835e-56 3.3953261720952486e-38 2.6331992 0.8006768 1.1988554