Skip to content

Instantly share code, notes, and snippets.

@josepsmartinez
Created June 1, 2023 21:03
Show Gist options
  • Save josepsmartinez/fe75898ea183fb73082461f8c4f3dfd1 to your computer and use it in GitHub Desktop.
Save josepsmartinez/fe75898ea183fb73082461f8c4f3dfd1 to your computer and use it in GitHub Desktop.
simbora
import requests
import io
import PyPDF2 # pip install PyPDF2
from seleniumwire import webdriver # pip install selenium-wire
def payload_from_gene_data(
    dataset: list[str],
    signature: str,
    cutoff_1: float,
    cutoff_2: float,
) -> dict[str, str]:
    """Build the form fields for one survival-analysis request.

    Args:
        dataset: Dataset codes (e.g. ``['BRCA', 'COAD']``); sent as a single
            newline-separated string.
        signature: Gene (or signature) identifier to analyse, e.g. ``'ERBB2'``.
        cutoff_1: Value for the ``groupcutoff1`` field; sent as ``str(cutoff_1)``.
        cutoff_2: Value for the ``groupcutoff2`` field; sent as ``str(cutoff_2)``.

    Returns:
        A dict of string form fields suitable for ``requests.post(data=...)``.
    """
    return {
        'methodoption': 'os',
        'dataset': '\n'.join(dataset),
        # Forward the caller's value; this used to be the literal string
        # 'signature', which silently ignored the argument.
        'signature': signature,
        'highcol': '#ff0000',
        'lowcol': '#0000ff',
        'groupcutoff1': str(cutoff_1),
        'groupcutoff2': str(cutoff_2),
        'axisunit': 'month',
        'ifhr': 'hr',
        'ifconf': 'conf',
        'signature_norm': '',
        'is_sub': 'false',
        'subtype': '',
    }
# HTTP headers sent with every `requests` call below. They imitate a desktop
# Chrome browser issuing an XMLHttpRequest from the GEPIA2 site itself.
headers = {
    # Content negotiation and caching
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'no-cache',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    # Where the request claims to come from
    'Origin': 'http://gepia2.cancer-pku.cn',
    'Pragma': 'no-cache',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://gepia2.cancer-pku.cn/',
    # Browser identification (adjacent literals are joined into one string)
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/113.0.0.0 Safari/537.36'
    ),
    'X-Requested-With': 'XMLHttpRequest',
}
# One form payload per survival analysis to request; each entry is posted to
# the server in turn by the loop further down.
all_genes_data = [
    payload_from_gene_data(
        dataset=['BRCA', 'COAD'],
        signature='ERBB2',
        cutoff_1=50,
        cutoff_2=50,
    ),
]
# Load the GEPIA2 survival page in a real Chrome instance so the site can set
# its cookies, then copy those cookies into a jar that `requests` can reuse.
# NOTE(review): 'suppress_connection_errors': False presumably makes
# selenium-wire report proxy connection errors instead of hiding them - confirm.
driver = webdriver.Chrome(seleniumwire_options={
    'suppress_connection_errors': False,
})
try:
    driver.get('http://gepia2.cancer-pku.cn/#survival')
    cookies = requests.cookies.RequestsCookieJar()
    for cookie in driver.get_cookies():
        cookies.set(
            cookie['name'],
            cookie['value'],
            domain=cookie['domain'],
            path=cookie['path'],
        )
finally:
    # The browser is not used again once the cookies have been copied; quit it
    # so the Chrome / chromedriver processes are not left running, even when
    # loading the page fails.
    driver.quit()
# For every payload: ask the server to render the survival plot, download the
# resulting PDF and extract the text of its first page.
for data in all_genes_data:
    # Generate the PDF on the server.
    # NOTE(review): verify=False is kept from the original. It has no effect on
    # this plain-http URL, but would disable TLS certificate checks if the
    # request were ever redirected to https.
    response = requests.post(
        'http://gepia2.cancer-pku.cn/assets/PHP4/survival_zf.php',
        data=data,
        verify=False,
        cookies=cookies,
        headers=headers,
    )
    # Fail here with a clear HTTP error instead of choking on an error page.
    response.raise_for_status()
    # Parse the reply as data instead of eval()-ing text received from the
    # network. Assumes the reply is a JSON object with an "outdir" key (the
    # request's Accept header asks for JSON) - confirm against the server.
    response_dict = response.json()

    # Download the PDF.
    response = requests.get(
        f'http://gepia2.cancer-pku.cn/tmp/{response_dict["outdir"]}',
        cookies=cookies,
        headers=headers,
    )
    response.raise_for_status()
    pdf_bytes = response.content

    # TODO: process the PDF
    # `bytes` is immutable and has no append(); build a new object instead.
    # Only add the end-of-file marker when it is missing, so an already
    # well-formed file does not end up with a duplicate marker.
    if not pdf_bytes.rstrip().endswith(b'%%EOF'):
        pdf_bytes += b'%%EOF\r\n'
    read_pdf = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
    # `pages[...]` / `extract_text()` replace getPage() / extractText(), which
    # were removed in PyPDF2 3.x.
    page = read_pdf.pages[0]
    page_content = page.extract_text()
    # NOTE(review): debugging hook kept from the original; it stops after each
    # payload so `page_content` can be inspected. Replace with real processing.
    breakpoint()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment