{ "cells": [ { "cell_type": "markdown", "id": "acf57be5", "metadata": {}, "source": [ "# Universal Decomposition of Perturbation Effects" ] }, { "cell_type": "markdown", "id": "1bbabca5", "metadata": {}, "source": [ "In this tutorial, we will systematically analyze a perturb-seq dataset by decomposing the perturbation effects onto the globally defined principal axes of cell state transitions." ] }, { "cell_type": "code", "execution_count": 1, "id": "9ddaf721", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import scipy\n", "import matplotlib.pyplot as plt\n", "\n", "import scanpy as sc" ] }, { "cell_type": "markdown", "id": "c48144db", "metadata": {}, "source": [ "## Load and preprocess the perturb-seq dataset" ] }, { "cell_type": "markdown", "id": "447eb1f8", "metadata": {}, "source": [ "Load the example perturb-seq dataset. This is a CRISPRi dataset." ] }, { "cell_type": "code", "execution_count": 2, "id": "9f7881d2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AnnData object with n_obs × n_vars = 65337 × 32738\n", " obs: 'perturbation', 'read count', 'UMI count', 'tissue_type', 'cell_line', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'ncounts', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts'\n", " var: 'ensembl_id', 'ncounts', 'ncells'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adata = sc.read_h5ad('data/tutorial_data/AdamsonWeissman2016_GSM2406681_10X010.h5ad')\n", "adata" ] }, { "cell_type": "markdown", "id": "3ecd8fd3", "metadata": {}, "source": [ "Normalize the total count of each cell" ] }, { "cell_type": "code", "execution_count": 3, "id": "53fae352", "metadata": {}, "outputs": [], "source": [ "sc.pp.normalize_total(adata, target_sum=1e4)" ] }, { "cell_type": "markdown", "id": "00b4dbe9", "metadata": {}, "source": [ "Let's have a look at what genes are perturbed in this dataset and the number of cells for each 
perturbation." ] }, { "cell_type": "code", "execution_count": 4, "id": "6bcfb126", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "63(mod)_pBA580\t6010\n", "Gal4-4(mod)_pBA582\t1283\n", "IER3IP1_pDS002\t1222\n", "SEC61B_pDS033\t1185\n", "ASCC3_pDS052\t1142\n", "DNAJC19_pDS026\t1000\n", "HSPA9_pDS088\t894\n", "HSPA5_pDS017\t881\n", "YIPF5_pDS186\t817\n", "SAMM50_pDS156\t817\n", "TARS_pDS405\t803\n", "XRN1_pDS411\t788\n", "GBF1_pDS043\t760\n", "TIMM23_pDS284\t757\n", "DAD1_pDS499\t752\n", "SEC61A1_pDS032\t740\n", "AMIGO3_pDS434\t715\n", "SRPRB_pDS404\t703\n", "SRP68_pDS403\t699\n", "UFM1_pDS040\t692\n", "TELO2_pDS496\t691\n", "FECH_pDS494\t683\n", "MRPL39_pDS498\t683\n", "IARS2_pDS091\t673\n", "SCYL1_pDS160\t669\n", "DARS_pDS495\t664\n", "IDH3A_pDS393\t663\n", "DERL2_pDS042\t656\n", "SEC63_pDS218\t654\n", "TTI2_pDS408\t653\n", "SYVN1_pDS442\t650\n", "SEC61G_pDS440\t650\n", "SPCS3_pDS402\t647\n", "OST4_pDS353\t646\n", "ATP5B_pDS055\t644\n", "SLMO2_pDS433\t637\n", "FARSB_pDS390\t632\n", "TMED10_pDS036\t626\n", "PSMD4_pDS488\t625\n", "EIF2B4_pDS491\t625\n", "SLC35B1_pDS046\t621\n", "EIF2B2_pDS463\t615\n", "ASCC3_pDS051\t614\n", "DDRGK1_pDS041\t607\n", "GMPPB_pDS391\t606\n", "SLC39A7_pDS219\t601\n", "SRPR_pDS482\t591\n", "TIMM44_pDS430\t590\n", "SPCS2_pDS401\t587\n", "UFL1_pDS410\t585\n", "PDIA6_pDS029\t583\n", "IER3IP1_pDS110\t582\n", "YIPF5_pDS226\t574\n", "MRGBP_pDS124\t570\n", "SCYL1_pDS159\t561\n", "P4HB_pDS397\t560\n", "MANF_pDS027\t554\n", "PTDSS1_pDS478\t534\n", "TTI1_pDS407\t526\n", "DNAJC19_pDS074\t525\n", "MARS_pDS394\t523\n", "SRP72_pDS505\t520\n", "MTHFD1_pDS395\t520\n", "EIF2S1_pDS386\t511\n", "DHDDS_pDS383\t507\n", "NEDD8_pDS396\t506\n", "TMED2_pDS175\t505\n", "GNPNAT1_pDS506\t503\n", "CARS_pDS460\t501\n", "STT3A_pDS011\t499\n", "HYOU1_pDS089\t494\n", "ARHGAP22_pDS458\t490\n", "SEC61B_pDS162\t483\n", "CHERP_pDS024\t479\n", "SARS_pDS467\t473\n", "TMEM167A_pDS038\t468\n", "KCTD16_pDS096\t464\n", "SEC61A1_pDS031\t451\n", 
"SOCS1_pDS479\t450\n", "AARS_pDS381\t447\n", "SEL1L_pDS373\t443\n", "IARS2_pDS090\t438\n", "EIF2B3_pDS508\t415\n", "HSD17B12_pDS087\t404\n", "DDOST_pDS382\t382\n", "CCND3_pDS006\t379\n", "QARS_pDS510\t379\n", "HARS_pDS466\t325\n", "ATF4_pBA608\t325\n", "HSPA5_pDS371\t301\n", "COPZ1_pDS462\t282\n", "CAD_pDS468\t281\n", "PPWD1_pDS398\t234\n", "COPB1_pDS065\t225\n", "GBF1_pDS044\t198\n", "EIF2AK3_pBA572\t141\n", "*\t101\n", "PSMD12_pDS008\t49\n", "ERN1_pBA575\t43\n", "ERN1_pBA574\t40\n", "XBP1_pBA579\t34\n", "PSMA1_pDS007\t29\n", "XBP1_pBA578\t28\n", "IER3IP1_pDS003\t26\n", "ATF4_pBA577\t23\n", "EIF2AK3_pBA573\t20\n", "CCND3_pDS005\t18\n", "C7orf26_pDS004\t12\n", "PSMD12_pDS009\t11\n", "ATF6_pBA586\t10\n", "YIPF5_pDS001\t9\n", "ATF4_pBA576\t5\n", "62(mod)_pBA581\t2\n", "STT3A_pDS010\t1\n" ] } ], "source": [ "condition_col = 'perturbation'\n", "condition_cell_counts = adata.obs[condition_col].value_counts()\n", "for c in condition_cell_counts.index:\n", " print(f'{c}\\t{condition_cell_counts.loc[c]}')" ] }, { "cell_type": "markdown", "id": "6d23ee02", "metadata": {}, "source": [ "Let's perform quality control: we filter out perturbations with too few cells and perturbations that don't effectively knock down their target genes." ] }, { "cell_type": "code", "execution_count": 5, "id": "97c1bd8e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_752705/3671995774.py:33: RuntimeWarning: invalid value encountered in scalar divide\n", " fc = np.mean(perturbed_exps) / np.mean(control_exps)\n" ] }, { "data": { "text/html": [ "
| \n", " | perturbed_gene | \n", "control_condition | \n", "fc | \n", "pval | \n", "
|---|---|---|---|---|
| condition | \n", "\n", " | \n", " | \n", " | \n", " |
| SEC61B_pDS033 | \n", "SEC61B | \n", "63(mod)_pBA580 | \n", "0.173211 | \n", "0.000000e+00 | \n", "
| ASCC3_pDS052 | \n", "ASCC3 | \n", "63(mod)_pBA580 | \n", "0.117991 | \n", "1.549447e-94 | \n", "
| DNAJC19_pDS026 | \n", "DNAJC19 | \n", "63(mod)_pBA580 | \n", "0.162940 | \n", "0.000000e+00 | \n", "
| HSPA9_pDS088 | \n", "HSPA9 | \n", "63(mod)_pBA580 | \n", "0.287186 | \n", "1.990341e-254 | \n", "
| HSPA5_pDS017 | \n", "HSPA5 | \n", "63(mod)_pBA580 | \n", "0.462840 | \n", "4.543529e-96 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| COPZ1_pDS462 | \n", "COPZ1 | \n", "63(mod)_pBA580 | \n", "0.306617 | \n", "2.228928e-69 | \n", "
| CAD_pDS468 | \n", "CAD | \n", "63(mod)_pBA580 | \n", "0.211891 | \n", "2.448909e-17 | \n", "
| PPWD1_pDS398 | \n", "PPWD1 | \n", "63(mod)_pBA580 | \n", "0.305571 | \n", "1.135729e-08 | \n", "
| COPB1_pDS065 | \n", "COPB1 | \n", "63(mod)_pBA580 | \n", "0.503718 | \n", "1.162430e-07 | \n", "
| GBF1_pDS044 | \n", "GBF1 | \n", "63(mod)_pBA580 | \n", "0.331636 | \n", "2.903447e-04 | \n", "
87 rows × 4 columns
\n", "| gene_symbol | \n", "MIR1302-10 | \n", "FAM138A | \n", "OR4F5 | \n", "RP11-34P13.7 | \n", "RP11-34P13.8 | \n", "AL627309.1 | \n", "RP11-34P13.14 | \n", "RP11-34P13.9 | \n", "AP006222.2 | \n", "RP4-669L17.10 | \n", "... | \n", "KIR3DL2-1 | \n", "AL590523.1 | \n", "CT476828.1 | \n", "PNRC2-1 | \n", "SRSF10-1 | \n", "AC145205.1 | \n", "BAGE5 | \n", "CU459201.1 | \n", "AC002321.2 | \n", "AC002321.1 | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 63(mod)_pBA580 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000095 | \n", "0.003224 | \n", "0.0 | \n", "0.0 | \n", "0.000555 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000388 | \n", "0.009653 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| AARS_pDS381 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.006999 | \n", "0.0 | \n", "0.0 | \n", "0.000964 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.011662 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| ARHGAP22_pDS458 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.003068 | \n", "0.002012 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.011770 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| ASCC3_pDS051 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.003492 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.008839 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| ASCC3_pDS052 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.001639 | \n", "0.002893 | \n", "0.0 | \n", "0.0 | \n", "0.001667 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.012430 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| UFL1_pDS410 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.001283 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000954 | \n", "0.009231 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| UFM1_pDS040 | \n", "0.000432 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000763 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.009446 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| XRN1_pDS411 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.002461 | \n", "0.005934 | \n", "0.0 | \n", "0.0 | \n", "0.000889 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.011627 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| YIPF5_pDS186 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.006665 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000531 | \n", "0.011071 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| YIPF5_pDS226 | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.004654 | \n", "0.0 | \n", "0.0 | \n", "0.001430 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.001212 | \n", "0.015296 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
88 rows × 32738 columns
\n", "