numpde · September 14, 2020 04:35
diff --git a/affy_in_python b/affy_in_python
 # RA, 2020-09-12

 # A look at the GSE60880:
 #   Human Lung Fibroblasts treated with TGFbeta, IL1, EGF and small molecule inhibitors of TGFBR1 and p38.
 #   HLF cells were incubated with each agent for 0.5 hours, 1 hour, 2 hours or 8 hours in standard conditions
 #   before total RNA was prepared and hybridised to Affymetrix microarrays.
 #   https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE60880

 from tcga.utils import download
 download = download.to(rel_path="download")

 # Note: CDF for "Affymetrix Human Genome U133A 2.0 Array"
 # can be obtained from
 # http://www.affymetrix.com/support/technical/byproduct.affx?product=hgu133-20
 # in "Human Genome U133A 2.0 Array (zip, 13 MB)"

 # Note: CDF format is described at
 # http://dept.stat.lsa.umich.edu/~kshedden/Courses/Stat545/Notes/AffxFileFormats/cdf.html
 #

 url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE60880&format=file"
 import io
 from pathlib import Path
 import tempfile
 import pandas as pd
 import tarfile, gzip
 from Bio.Affy import CelFile

 with download(url).now.open(mode='rb') as fd:
    with tarfile.TarFile(fileobj=fd) as tf:
        for m in tf.getmembers():
            with tf.extractfile(m) as zf:
                with gzip.open(zf) as fd:
                    # cf = CelFile.read(io.StringIO(fd.read().decode()))
                    with tempfile.NamedTemporaryFile(mode='wb') as tmp:
                        tmp.write(fd.read())

                        from rpy2.robjects.packages import importr
                        from rpy2.robjects import r as R, pandas2ri, StrVector
                        importr('affy')
                        data = (R['ReadAffy'])(filenames=StrVector([tmp.name]), sampleNames=StrVector([Path(m.name).stem]))
                        eset = (R['expresso'])(data, bgcorrect_method="rma", normalize_method="quantiles", pmcorrect_method="pmonly", summary_method="medianpolish")
                        df = pandas2ri.rpy2py((R['as'])(eset, "data.frame")).T
                        print(df)
                        
                        exit()
                        
 # Corresponding R code:
 #
 # if (!requireNamespace("BiocManager", quietly = TRUE))
 #   install.packages("BiocManager")
 # 
 # BiocManager::install("affy")
 # BiocManager::install("hgu133a2cdf")
 # 
 # library("affy")
 # browseVignettes("affy")
 # 
 # data = ReadAffy(filenames=c("/home/ra/repos/cbb/20200912-TranscrResponse/download/wMs5HrzYcF7WviVIKryKpQK9AclIB6cC5vn_WeP9hM0=/data (1)/GSM1492784_GP866_3_209714002_92367.CEL"))
 # data
 # 
 # # http://127.0.0.1:29746/library/affy/doc/affy.pdf
 # pm(data) # perfect match
 # mm(data) # mismatch
 # probeNames(data)   # affyID
 # geneNames(data)
 # sampleNames(data)
 # featureNames(data)
 # 
 # summaryAffyRNAdeg(AffyRNAdeg(data))
 # 
 # eset = expresso(data, bgcorrect.method="rma",
 #                 normalize.method="quantiles",
 #                 pmcorrect.method="pmonly",
 #                 summary.method="medianpolish")
 # 
 # df = as(eset, "data.frame")
	# RA, 2020-09-12

	# A look at the GSE60880:
	# Human Lung Fibroblasts treated with TGFbeta, IL1, EGF and small molecule inhibitors of TGFBR1 and p38.
	# HLF cells were incubated with each agent for 0.5 hours, 1 hour, 2 hours or 8 hours in standard conditions
	# before total RNA was prepared and hybridised to Affymetrix microarrays.
	# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE60880

	from tcga.utils import download
	download = download.to(rel_path="download")

	# Note: CDF for "Affymetrix Human Genome U133A 2.0 Array"
	# can be obtained from
	# http://www.affymetrix.com/support/technical/byproduct.affx?product=hgu133-20
	# in "Human Genome U133A 2.0 Array (zip, 13 MB)"

	# Note: CDF format is described at
	# http://dept.stat.lsa.umich.edu/~kshedden/Courses/Stat545/Notes/AffxFileFormats/cdf.html
	#

	url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE60880&format=file"
	import io
	from pathlib import Path
	import tempfile
	import pandas as pd
	import tarfile, gzip
	from Bio.Affy import CelFile

	with download(url).now.open(mode='rb') as fd:
	with tarfile.TarFile(fileobj=fd) as tf:
	for m in tf.getmembers():
	with tf.extractfile(m) as zf:
	with gzip.open(zf) as fd:
	# cf = CelFile.read(io.StringIO(fd.read().decode()))
	with tempfile.NamedTemporaryFile(mode='wb') as tmp:
	tmp.write(fd.read())

	from rpy2.robjects.packages import importr
	from rpy2.robjects import r as R, pandas2ri, StrVector
	importr('affy')
	data = (R['ReadAffy'])(filenames=StrVector([tmp.name]), sampleNames=StrVector([Path(m.name).stem]))
	eset = (R['expresso'])(data, bgcorrect_method="rma", normalize_method="quantiles", pmcorrect_method="pmonly", summary_method="medianpolish")
	df = pandas2ri.rpy2py((R['as'])(eset, "data.frame")).T
	print(df)

	exit()

	# Corresponding R code:
	#
	# if (!requireNamespace("BiocManager", quietly = TRUE))
	# install.packages("BiocManager")
	#
	# BiocManager::install("affy")
	# BiocManager::install("hgu133a2cdf")
	#
	# library("affy")
	# browseVignettes("affy")
	#
	# data = ReadAffy(filenames=c("/home/ra/repos/cbb/20200912-TranscrResponse/download/wMs5HrzYcF7WviVIKryKpQK9AclIB6cC5vn_WeP9hM0=/data (1)/GSM1492784_GP866_3_209714002_92367.CEL"))
	# data
	#
	# # http://127.0.0.1:29746/library/affy/doc/affy.pdf
	# pm(data) # perfect match
	# mm(data) # mismatch
	# probeNames(data) # affyID
	# geneNames(data)
	# sampleNames(data)
	# featureNames(data)
	#
	# summaryAffyRNAdeg(AffyRNAdeg(data))
	#
	# eset = expresso(data, bgcorrect.method="rma",
	# normalize.method="quantiles",
	# pmcorrect.method="pmonly",
	# summary.method="medianpolish")
	#
	# df = as(eset, "data.frame")