Skip to content

Commit c3b5fe8

Browse files
committed
initial push
1 parent afaaa0f commit c3b5fe8

7 files changed

Lines changed: 327228 additions & 1 deletion

ENCFF068NRZ.tsv

Lines changed: 59527 additions & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,8 @@
1-
# klf_manuscript_code
1+
# KLF Manuscript Code
2+
3+
## Authors
4+
- Julia Schaepe @juliaschaepe
5+
- Ben Doughty @bdoughty
6+
7+
## Overview
8+
This repository contains scripts for calling single-molecule binding states from methylation data (classify_single_molecule_binding_v4.py) and fitting the partition function model (partition_function_model_functions.py), both adapted from https://github.com/GreenleafLab/amplicon-smf. This repository also contains the script for taking ENCODE K562 RNA-seq data and plotting TF expression levels (RNA-seq_analysis.ipynb).

RNA-seq_analysis.ipynb

Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 2,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import numpy as np\n",
10+
"import matplotlib.pyplot as plt\n",
11+
"import pandas as pd\n",
12+
"import seaborn as sns"
13+
]
14+
},
15+
{
16+
"cell_type": "code",
17+
"execution_count": 3,
18+
"metadata": {},
19+
"outputs": [],
20+
"source": [
21+
"t2g = pd.read_csv('t2gmap.csv', header=0)\n",
22+
"encode = pd.read_csv('ENCFF068NRZ.tsv', sep='\\t', header=0)"
23+
]
24+
},
25+
{
26+
"cell_type": "code",
27+
"execution_count": 4,
28+
"metadata": {},
29+
"outputs": [],
30+
"source": [
31+
"for index, row in encode.iterrows():\n",
32+
" target_id = encode.iloc[index]['gene_id']\n",
33+
" encode.at[index, 'gene_id'] = target_id.split('.')[0]"
34+
]
35+
},
36+
{
37+
"cell_type": "code",
38+
"execution_count": 5,
39+
"metadata": {},
40+
"outputs": [],
41+
"source": [
42+
"target_ids = []\n",
43+
"for i in range(18):\n",
44+
" tf = 'KLF' + str(i+1)\n",
45+
" if tf == 'KLF13':\n",
46+
" target_ids.append(t2g[t2g['ext_gene']==tf]['ens_gene'].to_list()[2])\n",
47+
" else: \n",
48+
" target_ids.append(t2g[t2g['ext_gene']==tf]['ens_gene'].to_list()[0])"
49+
]
50+
},
51+
{
52+
"cell_type": "code",
53+
"execution_count": 6,
54+
"metadata": {},
55+
"outputs": [
56+
{
57+
"name": "stderr",
58+
"output_type": "stream",
59+
"text": [
60+
"<ipython-input-6-b06263ed169c>:3: SettingWithCopyWarning: \n",
61+
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
62+
"Try using .loc[row_indexer,col_indexer] = value instead\n",
63+
"\n",
64+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
65+
" subset['ext_gene']=subset['gene_id'].map(t2g_dict)\n"
66+
]
67+
}
68+
],
69+
"source": [
70+
"t2g_dict = dict(zip(t2g['ens_gene'], t2g['ext_gene']))\n",
71+
"subset = encode[encode['gene_id'].isin(target_ids)]\n",
72+
"subset['ext_gene']=subset['gene_id'].map(t2g_dict)\n",
73+
"subset = subset.sort_values(by='ext_gene')\n",
74+
"subset['klf-number']=[1,10,11,12,13,14,15,16,17,18,2,3,4,5,6,7,8,9]\n",
75+
"subset = subset.sort_values(by='klf-number')"
76+
]
77+
},
78+
{
79+
"cell_type": "code",
80+
"execution_count": 7,
81+
"metadata": {},
82+
"outputs": [],
83+
"source": [
84+
"lower_bound = np.asarray(subset['TPM']-subset['TPM_ci_lower_bound'])\n",
85+
"lower_bound = lower_bound*(lower_bound>0)\n",
86+
"upper_bound = np.asarray(subset['TPM_ci_upper_bound']-subset['TPM'])\n",
87+
"upper_bound = upper_bound*(upper_bound>0)"
88+
]
89+
},
90+
{
91+
"cell_type": "code",
92+
"execution_count": 8,
93+
"metadata": {},
94+
"outputs": [
95+
{
96+
"data": {
97+
"image/png": "iVBORw0KGgoAAAANSUhEUgAAALoAAABkCAYAAAAxOiquAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAIkElEQVR4nO3df4wcZR3H8ffXWKhCaAVqsaBeqWdNzoCtxB80BFFrj7tD+4eg1qSCiSU9SbRFkWDVFfFHQ1oLxm4kNUKk/EGCVnttLjaaHk00sSU9osQfZ2mLeAq9WBEL0tZ+/OOZ2+zNze5db2d3Z3e+r2Sy3Xlmnnlu9jvPPn2enXlMEs61u1c1uwDONYIHussFD3SXCx7oLhc80F0ueKC7XGhqoHd3dwvwxZc0l0RNDfSxsbFmHt7liDddXC5kOtALhQJmVloKhUKzi+Ra1KubXYBKisUi8+fPp7OzE4B169Y1uUSulWW6RncuLR7oLhcyHegDAwOMjIwwMjJCf38/AwMDzS6Sa1GZbaMD9PX10dfX1+xiuDaQ6RrdubSkUqObWRewAlgIHAAuBs6TdHca+TtXq1RqdElPAc8BlwHvkLQJwMzmxrc1szVmdsDMDhw7diyNwzs3pdSaLpK2A9uAjvFVFbZ7QNJVkq6aN29eWod3rqq0mi7dwBXAImCHmd0OIOlfaeTvXK1SCXRJg8BgGnk5Vw/e6+JywQPd5YIHussFD3SXCx7oLhc80F0ueKC7XPBAd7ngge5ywQPd5YIHussFD3SXCx7oLhcq/nrRzIaBvcAZAEnrq2x7DXA10An8inCn0RzgS/K5Y1wGVKvRPwMcB44AX62WiaR9kjZG294o6ZvA74Er49v6HUauGaoF+jDwF8LtcRdMlZGZrQIOE4K9Ir/DyDVDtUD/I/Be4G/AR6tlYmY3AauBecCwmd0FdAFPplRO52pS7Q6j9wOzgFNTZSLpUeDRtArlXNqqBfoXo9dhSdsaURjn6qVa0+VpSbcxjfa5c1lXLdC/YGabgaXRq3Mtq1rT5X5gK2ANKotzdVMt0K8FziEEugB/vJxrWdUCfVDSfQ0riXN1VK2N/v2GlaIOfP4jV65ijS7pdCMLkiaf/8jFte2vF322DFcu0zNe1MJny3Dl2rZGd66cB7rLBQ90lwtpTQTwVuAuYAdwEliC32HkMiStOYz+DDwYvV3udxi5rGl406Ud7jDK6mBUVsuVBWk1XS4h3IX0GuDx6A6jOcCP08g/awqFAnv37gUovWZBVsuVBWnNYfQP4LY08sq6YrEIwOjo6IT3a9eubVqZysuRtXJlhfe6zICPuraeth0ZradmjLoWi0UGBgbYvXt3aV1PTw+7du0qvR+/AAH6+/vp6enxGj3igd5G/GcPlXmgtxAP5JnzNrrLBQ/0jCgWi/T29k7oB+/t7W12sdqGB7oD2n+wydvoGdLMNni7DzZ5je5KzaahoSGGhoZSbzZl4dvCa3QH1O/bpFgssn///gnr4u8bwQPd1V0WukW96eJq0iq9RakHupktNbN7zGyzmZ2Xdv71VusHl4X2qJusHjX6J4AC4W6j5XXIvybFYpHOzs4JwTj+/Jc08p6qPdqOF0JfXx9bt24tLWfTTGnU+bC073Qzs3uBO4FlwIWSdsTS1wBroreLgT9NI9uLgbHYusXA+WXv/1Mhr6R9p5NWa3qltAXAG8re/x0YbUC5ajlfaZzrSulTnY/pHnvcmKTuSWslpboA7wS+AWwCzk8pzwMzSavnvl6ubOU91ZJ6r4ukJ4An0s7XuVp4r4vLhVYJ9AdmmFbPfeuZt5fr7NOrSv0/o85lUavU6M7VJNM/ASh/Aphi3ZRR+jXA1UAncIekf5aldQErgIXA1yVN6vYys17gVkkfTkhbCVxHmA37PpV99ZnZQuBThK6uH0h6MbbvtYSnla0EbpZ0pCztBsJExXOBTZIOxfa9HngXcBFwp6SX4ueC2NPQor9/PG00+ndB0nDCvqeBtwOXA5+VdCqW/lx0TjuAddF2pc/AzFYD10m6JSHvjqhMz0r6YUL6s4TP5GVJm2Np/wMWAR8DPijpRCx9PnAh8BZgvaQXOAuZrtE18QlgSen7JG0kTMs+N5b2FOFDu4yESYHNbAkwG3i6QvYngBcJkwrHz9OtwPhFNSlvSUPA94A/lAd55GXCB/Za4PmE43YD9xL6iksDbqryNLTyNEm/JQQGSftKGpD0HeAVwhxV8fTfRH/T64Ez5Wlmthw4CryQlDdwnHARzq6Q/ulo39NmZrHj7gQeAvZJOpGw70uEmckB/p1w3qrKdKBPh5mtAg5LmhSwkrYD24A3Jex6PfBGYEkU9PF990jaABwiTFxWbjbwS+B3QKXfB6wEfp6wfjHhGTg/SsgXwgXSD7yNaczaPRNm9jlg53hAxUm6H9hDuCDLfYDwTbLEzBYl7PeQpG8D55rZ5QlZX0II3FeApQnptxDOS5KLJK0HHge6KmxTUdabLqUngJnZQUlHY+k3AauBQTN7c3m6mXUDVxC+Dr8Wz1vSt6LtOiQdTDj2+4B3E5o+X44lPxgddxZhYCzJCkLNHzcGbCB8xX83If3c6PV54Bdl5an4NLTYeToOfAjoMrOjko7H0q8kXERmZvsT0ucQavNOYHvsuHdLOhqds0PxckX7LgAuJTRT4uX+GXA78Lp43mZ2EFgsqTSnbWzfZ8zsDsI39E8qnPOKvNfF5ULLN12cmw4PdJcLHuguFzzQXS5kutel1ZnZzcAw8B5C3+85wHDZQM4Q8FPg11H/d03HGc/XTeaBXn8bCCOr+6KATFKa48bMHgF2Acsk9ZvZFkmfN7MthIGgVYTBrDFCV+ST0a6rzOxGQt/9pYQuxAuAjcDDwCPAY5L+m+6f1xq86VJ/RwiDLEkOStoi6XDZumPRQNfJ2LYWve4hDCqdJNyyuCxaPwjcA3wkWkYJF8NC4JCk7XkNcvAavREeBhaY2VeAvwKfjAajKs0eEB/YmGVmHycM4kAYLT0DnJJ0xszGK6tuwsjlTsKtaR2EYfNnou1zzQeMXC5408Xlgge6ywUPdJcLHuguFzzQXS54oLtc8EB3ufB/TRDNV7fb9UsAAAAASUVORK5CYII=\n",
98+
"text/plain": [
99+
"<Figure size 192x86.4 with 1 Axes>"
100+
]
101+
},
102+
"metadata": {
103+
"needs_background": "light"
104+
},
105+
"output_type": "display_data"
106+
}
107+
],
108+
"source": [
109+
"fig, ax = plt.subplots(figsize=(4/1.5,3/2.5))\n",
110+
"fontsize=6\n",
111+
"plt.bar(x=subset['klf-number'], height=subset['TPM'], color='#969696')\n",
112+
"plt.errorbar(x=subset['klf-number'], y=subset['TPM'], yerr=[lower_bound, upper_bound], fmt='none', c='black', capsize=2)\n",
113+
"plt.xticks([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18])\n",
114+
"ax.set_xticklabels([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18], fontsize = fontsize)\n",
115+
"plt.ylabel('TPM', fontsize=fontsize)\n",
116+
"plt.xlabel('KLF number', fontsize=fontsize)\n",
117+
"plt.tick_params(axis='both', labelsize=fontsize)\n",
118+
"ax.spines['top'].set_visible(False)\n",
119+
"ax.spines['right'].set_visible(False)\n",
120+
"plt.savefig('KLF_expression.pdf', dpi=300)"
121+
]
122+
},
123+
{
124+
"cell_type": "code",
125+
"execution_count": 9,
126+
"metadata": {},
127+
"outputs": [],
128+
"source": [
129+
"target_ids = []\n",
130+
"for i in range(9):\n",
131+
" tf = 'SP' + str(i+1)\n",
132+
" target_ids.append(t2g[t2g['ext_gene']==tf]['ens_gene'].to_list()[0])"
133+
]
134+
},
135+
{
136+
"cell_type": "code",
137+
"execution_count": 10,
138+
"metadata": {},
139+
"outputs": [
140+
{
141+
"name": "stderr",
142+
"output_type": "stream",
143+
"text": [
144+
"<ipython-input-10-b44ec6e27bb2>:3: SettingWithCopyWarning: \n",
145+
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
146+
"Try using .loc[row_indexer,col_indexer] = value instead\n",
147+
"\n",
148+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
149+
" subset['ext_gene']=subset['gene_id'].map(t2g_dict)\n"
150+
]
151+
}
152+
],
153+
"source": [
154+
"subset = encode[encode['gene_id'].isin(target_ids)]\n",
155+
"t2g_dict = dict(zip(t2g['ens_gene'], t2g['ext_gene']))\n",
156+
"subset['ext_gene']=subset['gene_id'].map(t2g_dict)\n",
157+
"subset = subset.sort_values(by='ext_gene')\n",
158+
"subset['sp-number']=[1,2,3,4,5,6,7,8,9]\n",
159+
"subset = subset.sort_values(by='sp-number')"
160+
]
161+
},
162+
{
163+
"cell_type": "code",
164+
"execution_count": 11,
165+
"metadata": {},
166+
"outputs": [
167+
{
168+
"data": {
169+
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAHcAAABkCAYAAAC8e6+/AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAGrUlEQVR4nO2db4hdxRmHn59Y/JNUJWTFSpsajBoUrEmXSquSpSBu3FsJNC20H+InYxMFY5oSaQN701pLkWwTg3s1Bg2FfhGKf1hKUBq2tIiyK1nbSlvvGhPESJNNm2Laalvy9sPc3exuzjnJZufeczP7PjDcvWfmvvPe/TFz58yZeUdmhpMmF5TtgNM8XNyEcXETxsVNGBc3YVzchClV3O7ubgM8zS7lUqq4Y2NjZVafPN4tJ0yS4larVSRNpGq1WrZLpaAypx87OztteHi4Kba7uroAGBwcbIr9NkJ5GRe20otmU6vVJv4+fPjwlGvr1q0rxacySbJbdgJJijswMEC9Xqder7N+/XoGBgbKdqkUonTLkhYD9wIngKPAQmCemf0whv2ZUqlUqFQqZVTdVsRqufcDf2v8fYuZbQOQdMX0gpLWShqWNHz06NFI1TtZxBL3YuDXwB+AVY1rmcNwM9tlZp1m1tnR0RGpeieLWKPlPcAa4FNAVdJ3AczseCT7zjkQRVwzGwFGYthy4pHkaNkJuLgJ4+ImjIubMC5uwri4CePiJoyLmzAubsK4uAnj4iaMi5swLm7CRFsgJ6mH8ND+l5S8EsMJRGm5kpYRHtgf4AwrMZzWkdtyJY0Ag8BJADPbWGBnJfAvYNl4eXJWYkhaC6wFWLRo0Uz9dWZAUbd8H3A38Hfg2SIjZvYYgKRrgP1FKzHMbBewC8Ki9HNx2jk7isQdAa4HvgBcRljZWIiZbYjilROFot/cPwNfBj4AVrfGHScmRS33q4QFb/9tkS9OZIrE/V7jdcTMdrfCGScuRd3yATN7kPB765yHFIm7SVIfsLzx6pxnFHXLTwD9FOz/dNqbopa7AtjQSA+1wplarUZPT8+UXfE9PT2tqDpJilruXjPb0TJPnOgUiftky7yYhG+/jEdut2xm/2ulI058/Hluwri4CePiJoyLmzCxAp7cAXwFuA7YBywGLgc2mx+iUBpRWq6Z/dbMfgocBL5hZj8G/kh4FjwFD3jSOqJ1y5K+DbxHEDgXD3jSOmItkPsmIeBJBzAi6fvATcBbMew750asgCfPA8/HsOXEw0fLCePiJoyLmzAubsK4uAnj4iaMi5swLm7CuLgJMyfFnSvnDs05cWu1GkNDQ1OuDQ0NTTm2JhWSOlfobJntCstarUZfXx+jo6MT15YsWUK9Xo/hXjTOO3Gr1Spbt26deN/b21tKt7pxY1GggfYgercsabmkRyX1SZoX0/Zc6lJj0Izf3G8BVeBF4M7YxiuVCv39/ROpWQvYWzHoanYd0Q9qlPQ48AhwG7DAzF6clj8R8AS4AfhLjqmFQNEBu2fKn62Nq4HPTHr/IXB4BjZuAOZPen+C07/rmeo4GxtjZtadUT+YWdQEfBH4EbANmD8LO8OzyW8XG63yMytFH1CZ2ZvAm7HtOjNnzt3nziXaWdxds8xvFxut8vM0Sj352mku7dxynVnSluJKul7SHkmrcvLvkLRZ0m5JCzLyb5K0UdJOSQsL6umR9HJO3ipJOyRtkJQZF0TSYklVSZskfTojf0Xj84ON0InT878m6TFJ/ZKuzaljpaReSU9IujTvu2TRluKa2TuEkz3z8idvX7kiI/9t4K/AZ8kJkjYt0mwW/wQ+IgRay/s/TT43+LR6zOw3wE7gT2Z2MOPz/wYWAJcCR3Lq6AYeJ9zfzmhSqC3FPRvGt6+YWaY4ZvYLYDeQFxp2JfA5YFlD6Omff9XMtgDvEoK/ZDH53OC8yCyrgMzegTBJ8SDwXEEdO4H1wFJmGM2vLR8cSLqKEG/yEkn7zezQtPzx7St7JX0+I78buBm4FujNqsMmRZo1s/0ZPnQBtxJ2LP4gx9U9nDo3eFtOmbsILTyLMWALYUfkz3LKXNR4PQK8klMmEx8tJ8x52y07Z8bFTRgXN2Fc3IRpy9FyLBqj6hsJz0F/DmwF6oSN4Q+Y2cezsL3d2jzsf9LiEu5jx4B9ZnZE0idm1idpM2GCY7Rxy/N14BjwO+B2YDvQBRwHvkM4peVLwOvA7Wa2Blgq6X7gGsLKk58AhwgTHycJt1C/MrM3mv4tc0i6W7ZwvtHLwGpJ9wAXSXoYOGlmo5OK7gN2ECLyGCEM8fiU4zEzewr4GHgGOCjpMuCImT3dKDd+yMc/CKEjAF4qU1hIvOU2BF1CWArzPvCJmWVNFlgjXQC8BjxAEOkFTs0K/cfMTNLJRrkrGy1XwO8JPcTFhCg+HZw6X6k0fBIjYZLuluc6Lm7CuLgJ4+ImjIubMC5uwri4CfN/ibOBuwkDRtcAAAAASUVORK5CYII=\n",
170+
"text/plain": [
171+
"<Figure size 115.2x86.4 with 1 Axes>"
172+
]
173+
},
174+
"metadata": {
175+
"needs_background": "light"
176+
},
177+
"output_type": "display_data"
178+
}
179+
],
180+
"source": [
181+
"fig, ax = plt.subplots(figsize=(4/2.5,3/2.5))\n",
182+
"fontsize=6\n",
183+
"lower_bound = np.asarray(subset['TPM']-subset['TPM_ci_lower_bound'])\n",
184+
"lower_bound = lower_bound*(lower_bound>0)\n",
185+
"upper_bound = np.asarray(subset['TPM_ci_upper_bound']-subset['TPM'])\n",
186+
"upper_bound = upper_bound*(upper_bound>0)\n",
187+
"plt.bar(x=subset['sp-number'], height=subset['TPM'], color='#969696')\n",
188+
"plt.errorbar(x=subset['sp-number'], y=subset['TPM'], yerr=[lower_bound, upper_bound], fmt='none', c='black', capsize=2)\n",
189+
"plt.xticks([1,2,3,4,5,6,7,8,9])\n",
190+
"plt.ylabel('TPM', fontsize=fontsize)\n",
191+
"plt.xlabel('SP number', fontsize=fontsize)\n",
192+
"plt.tick_params(axis='both', labelsize=fontsize)\n",
193+
"ax.set_box_aspect(0.8)\n",
194+
"ax.spines['top'].set_visible(False)\n",
195+
"ax.spines['right'].set_visible(False)\n",
196+
"plt.savefig('SP_expression.pdf', dpi=300)"
197+
]
198+
},
199+
{
200+
"cell_type": "code",
201+
"execution_count": null,
202+
"metadata": {},
203+
"outputs": [],
204+
"source": []
205+
}
206+
],
207+
"metadata": {
208+
"kernelspec": {
209+
"display_name": "Python 3",
210+
"language": "python",
211+
"name": "python3"
212+
},
213+
"language_info": {
214+
"codemirror_mode": {
215+
"name": "ipython",
216+
"version": 3
217+
},
218+
"file_extension": ".py",
219+
"mimetype": "text/x-python",
220+
"name": "python",
221+
"nbconvert_exporter": "python",
222+
"pygments_lexer": "ipython3",
223+
"version": "3.9.7"
224+
}
225+
},
226+
"nbformat": 4,
227+
"nbformat_minor": 4
228+
}

0 commit comments

Comments
 (0)