Last active
September 14, 2020 04:35
-
-
Save numpde/772cd596fb5fe6036f7e29736bd1cf15 to your computer and use it in GitHub Desktop.
How to process a CEL file from Python (via R)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# RA, 2020-09-12 | |
# A look at the GSE60880: | |
# Human Lung Fibroblasts treated with TGFbeta, IL1, EGF and small molecule inhibitors of TGFBR1 and p38. | |
# HLF cells were incubated with each agent for 0.5 hours, 1 hour, 2 hours or 8 hours in standard conditions | |
# before total RNA was prepared and hybridised to Affymetrix microarrays. | |
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE60880 | |
from tcga.utils import download | |
download = download.to(rel_path="download") | |
# Note: CDF for "Affymetrix Human Genome U133A 2.0 Array" | |
# can be obtained from | |
# http://www.affymetrix.com/support/technical/byproduct.affx?product=hgu133-20 | |
# in "Human Genome U133A 2.0 Array (zip, 13 MB)" | |
# Note: CDF format is described at | |
# http://dept.stat.lsa.umich.edu/~kshedden/Courses/Stat545/Notes/AffxFileFormats/cdf.html | |
# | |
url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE60880&format=file" | |
import io | |
from pathlib import Path | |
import tempfile | |
import pandas as pd | |
import tarfile, gzip | |
from Bio.Affy import CelFile | |
with download(url).now.open(mode='rb') as fd: | |
with tarfile.TarFile(fileobj=fd) as tf: | |
for m in tf.getmembers(): | |
with tf.extractfile(m) as zf: | |
with gzip.open(zf) as fd: | |
# cf = CelFile.read(io.StringIO(fd.read().decode())) | |
with tempfile.NamedTemporaryFile(mode='wb') as tmp: | |
tmp.write(fd.read()) | |
from rpy2.robjects.packages import importr | |
from rpy2.robjects import r as R, pandas2ri, StrVector | |
importr('affy') | |
data = (R['ReadAffy'])(filenames=StrVector([tmp.name]), sampleNames=StrVector([Path(m.name).stem])) | |
eset = (R['expresso'])(data, bgcorrect_method="rma", normalize_method="quantiles", pmcorrect_method="pmonly", summary_method="medianpolish") | |
df = pandas2ri.rpy2py((R['as'])(eset, "data.frame")).T | |
print(df) | |
exit() | |
# Corresponding R code: | |
# | |
# if (!requireNamespace("BiocManager", quietly = TRUE)) | |
# install.packages("BiocManager") | |
# | |
# BiocManager::install("affy") | |
# BiocManager::install("hgu133a2cdf") | |
# | |
# library("affy") | |
# browseVignettes("affy") | |
# | |
# data = ReadAffy(filenames=c("/home/ra/repos/cbb/20200912-TranscrResponse/download/wMs5HrzYcF7WviVIKryKpQK9AclIB6cC5vn_WeP9hM0=/data (1)/GSM1492784_GP866_3_209714002_92367.CEL")) | |
# data | |
# | |
# # http://127.0.0.1:29746/library/affy/doc/affy.pdf | |
# pm(data) # perfect match | |
# mm(data) # mismatch | |
# probeNames(data) # affyID | |
# geneNames(data) | |
# sampleNames(data) | |
# featureNames(data) | |
# | |
# summaryAffyRNAdeg(AffyRNAdeg(data)) | |
# | |
# eset = expresso(data, bgcorrect.method="rma", | |
# normalize.method="quantiles", | |
# pmcorrect.method="pmonly", | |
# summary.method="medianpolish") | |
# | |
# df = as(eset, "data.frame") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment