diffexpr.module

import sys
import os
import re
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu
from scipy.stats import ttest_ind
from operator import itemgetter

def read_coord(n):
	f = open(n)
	Xcen = []
	cells = []
	for l in f:
		l = l.rstrip("\n")
		ll = l.split(",")
		cells.append(ll[0])
		Xcen.append((float(ll[-2]), float(ll[-1])))
	f.close()
	Xcen2 = np.empty((len(Xcen), 2), dtype="float32")
	for ind,(i,j) in enumerate(Xcen):
		Xcen2[ind, :] = [j,-1.0*i]
	Xcen = Xcen2
	return Xcen, cells

def read_expression(n):
	f = open(n)
	h = f.readline().rstrip("\n").split()
	h = [xh.replace(".", "-") for xh in h]
	num_gene = 0
	for l in f:
		l = l.rstrip("\n")
		num_gene+=1
	f.close()
	mat = np.empty((num_gene, len(h)), dtype="float32")
	f = open(n)
	f.readline()
	ig = 0
	genes = []
	for l in f:
		l = l.rstrip("\n")
		ll = l.split()
		gene = ll[0]
		values = [float(v) for v in ll[1:]]
		mat[ig,:] = values
		genes.append(gene)
		ig+=1
	f.close()
	return mat, h, genes

if __name__=="__main__":
	sample = sys.argv[1]
	mat, cells, genes = read_expression("../dir_%s/Giotto_norm_expr.txt" % sample)
	Xcen, Xcells = read_coord("../dir_%s/spatial/tissue_positions_list.csv" % sample)

	map_cell = {}
	for ic,c in enumerate(Xcells):
		map_cell[c] = ic

	good_cell_ind = np.array([map_cell[c] for c in cells])
	Xcen = Xcen[good_cell_ind, :]

	gene_lists = []
	weights = []
	gene_lists.append(("PTPRC", "VIM", "PECAM1", "AR", "FCGR3A", "FCGR3B", "CD68", "CD163"))
	weights.append((1.0, 1.0, 1.0, 1.0, 0.17, 0.17, 0.33, 0.33))

	gene_lists.append(("CTLA4", "CD8A", "CD4", "CD163", "CD68", "PTPRC", "HIF1A", "GZMB", "PLK1", "PDCD1", "CD274", "CD3E",\
	"CD3D", "CD3G"))
	weights.append((1.0, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.3, 0.3, 0.3))

	gene_lists.append(("MKI67", "CDH1", "VEGFA", "ITGAX", "FOXP3", "KIFC1", "KRT7", "KRT17", "KRT18", "KRT76", "KRT77"))
	weights.append((1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2, 0.2, 0.2))

	gene_lists.append(("MPO", "HIF1A", "GZMB", "CD68", "CTLA4", "ARG1"))
	weights.append((1.0, 1.0, 1.0, 1.0, 1.0, 1.0))

	gene_lists.append(("FOXP3", "CD4", "PECAM1", "KIFC1", "CDH1"))
	weights.append((1.0, 1.0, 1.0, 1.0, 1.0))

	gene_lists.append(("CD68", "CD163", "VIM", "PTPRC", "MS4A1", "PLK1"))
	weights.append((0.5, 0.5, 1.0, 1.0, 1.0, 1.0))

	gene_lists.append(("PECAM1", "FCGR3A", "FCGR3B", "CD68", "FOXP3", "CD4", "VIM", "PTPRC"))
	weights.append((1.0, 0.25, 0.25, 0.5, 1.0, 1.0, 1.0, 1.0))

	gene_lists.append(("FCGR3A", "FCGR3B", "CD163", "CD14", "VIM", "AR", "PLK1"))
	weights.append((0.25, 0.25, 0.5, 1.0, 1.0, 1.0, 1.0))


	map_gene = {}
	for ig,g in enumerate(genes):
		map_gene[g] = ig

	dot_size = 5
	ncol = 6
	nrow = int(9*2 / ncol)
	size_factor = 3
	if 9*2%ncol>0:
		nrow+=1
	f, axn = plt.subplots(nrow, ncol, figsize=(ncol * size_factor, nrow * size_factor))
	#plt.subplots_adjust(hspace=0.01, wspace=0.01)
	plt.subplots_adjust(hspace=0.1, wspace=0.1)
	ct = 0
	for ind,aG in enumerate(gene_lists):
		gene_ids = []
		w_ids = []
		for inx,gx in enumerate(aG):
			if not gx in map_gene:
				continue
			gene_ids.append(map_gene[gx])
			w_ids.append(inx)
		w_ids = np.array(w_ids)
		gene_ids = np.array(gene_ids)
		t_weight = np.array(weights[ind])[w_ids]

		avg = np.average(mat[gene_ids,:], weights=t_weight, axis=0)
		#avg = np.mean(mat[gene_ids, :], axis=0)

		v1x = np.percentile(avg, 90)
		m2 = np.where(avg>v1x)[0]

		v1x = np.percentile(avg, 10)
		m2x = np.where(avg<=v1x)[0]

		print("Query:", " ".join(aG))
		aList = []

		for ig,g in enumerate(genes):
			if sum(mat[ig,m2])==0 or sum(mat[ig,m2x])==0:
				continue
			t2 = ttest_ind(mat[ig,m2], mat[ig,m2x], equal_var=False)
			t2_ks = mannwhitneyu(mat[ig,m2], mat[ig,m2x])

			if t2[0]>0 and t2[1]<0.05 and t2_ks[1]<0.05:
				#print(g, t2[1], t2_ks[1])
				aList.append((ig, g, t2[1], t2_ks[1]))
		aList.sort(key=itemgetter(2), reverse=False)
		for ig,g,j,k in aList:
			print(g, j, k, np.mean(mat[ig,m2]), np.mean(mat[ig,m2x]), np.mean(mat[ig,:]))
		

Running Command

python3 diffexpr.module.py

Results

Query: PTPRC VIM PECAM1 AR FCGR3A FCGR3B CD68 CD163
VIM 5.758939560746751e-102 5.60511765796084e-48 3.4079967 0.098993964 1.3349795
COL1A1 3.864652162745244e-83 8.392050875510012e-46 4.4381433 0.8394257 1.8453412
COL1A2 9.616054897600655e-80 2.730398831074907e-44 4.4184346 0.6233368 1.7685285
HLA-B 2.4678366027904186e-77 4.831162237969237e-43 3.3376515 1.0402361 1.7852688
COL3A1 1.137986496615667e-74 1.3306070075135856e-44 3.7037797 0.33060858 1.2484194
SPARC 4.706649318680122e-74 4.776126276408928e-44 3.4254394 0.42126575 1.2508768
HLA-A 4.190522120912639e-66 4.186431275929642e-40 3.0472841 1.1122996 1.729778
B2M 9.962084335998903e-64 7.832971011172612e-42 4.0291443 2.2660863 2.8000264
FN1 1.440785435299462e-63 9.018066661048086e-42 4.2459993 1.782621 2.754173
BGN 1.688966568540009e-63 2.948545580822197e-41 2.0247881 0.21093556 0.6704553
COL6A1 7.475147595865111e-61 1.4666639694194973e-42 2.2068079 0.2711836 0.7156327
COL6A2 1.1424802316442945e-60 2.2472873007840957e-40 2.2976284 0.24433999 0.72942185
TIMP2 3.7998777751878596e-60 6.423379281802069e-39 1.7416278 0.26670942 0.670993
NNMT 6.233349737193785e-59 4.926997454245028e-39 1.9940393 0.4969321 1.0961503
COL6A3 8.10816360435328e-59 1.3868201205530176e-43 1.7237471 0.11531009 0.47272012
LGALS1 2.339861291443835e-56 3.3953261720952486e-38 2.6331992 0.8006768 1.1988554