Skip to content

Instantly share code, notes, and snippets.

@mdamien
Created June 9, 2015 07:49
Show Gist options
  • Save mdamien/ffd9da8a7ad315b3f54b to your computer and use it in GitHub Desktop.
Save mdamien/ffd9da8a7ad315b3f54b to your computer and use it in GitHub Desktop.
Scraping example
import glob
import json
from pprint import pprint as pp
from pathlib import Path
from bs4 import BeautifulSoup
import csv
# Parse every scraped page under html/ into a list of dicts, then write the
# collected records to data/parsed.json and data/parsed.csv.
data = []
for i, filename in enumerate(glob.glob("html/*")):
    # The file stem is the numeric sujet id (pages were saved as html/<id>.html).
    infos = {
        'id': int(Path(filename).stem),
    }
    try:
        with open(filename) as f:
            # Name the parser explicitly: bare BeautifulSoup(f) guesses one,
            # emits a warning, and may pick different parsers on different
            # machines, producing inconsistent results.
            soup = BeautifulSoup(f, "html.parser")
            infos['titre'] = soup.find(id="tx_titre").text
            infos['sujet'] = soup.find(id="sujet").text
            infos['moyens'] = soup.find(id="travaux").text
            infos['commentaire'] = soup.find(id="commentaire").text
            infos['resp1'] = soup.find(id="resp1").text
            infos['resp2'] = soup.find(id="resp2").text
    except Exception as e:
        # Best-effort: a page missing an element (find() -> None, so .text
        # raises AttributeError) is logged and kept with whatever fields
        # were extracted before the failure.
        print(e)
        pp(infos)
    data.append(infos)
    if i % 100 == 0:
        print(i)  # progress indicator

data.sort(key=lambda x: x['id'])

# Use context managers so the output files are flushed and closed
# (json.dump(data, open(...)) leaked the file handle).
with open('data/parsed.json', 'w') as f:
    json.dump(data, f, indent=2)

# newline='' is the documented way to open a file for the csv module
# (prevents blank interleaved rows on Windows).
with open('data/parsed.csv', 'w', newline='') as f:
    # Build the union of keys across all rows, preserving first-seen order:
    # a record that failed early may lack fields, and DictWriter raises
    # ValueError if a row has a key absent from fieldnames.
    fieldnames = []
    for row in data:
        for key in row:
            if key not in fieldnames:
                fieldnames.append(key)
    w = csv.DictWriter(f, fieldnames=fieldnames)
    w.writeheader()
    for row in data:
        w.writerow(row)
import requests
# Download sujet pages from the UTC portal one id at a time, saving each raw
# response to html/<id>.html. Loops forever — stop with Ctrl+C once ids run out.
URL = "https://demeter.utc.fr/pls/portal30/enseignement.tx_etu_visu_page.show"

# NOTE(review): the Cookie / SSO_ID values below are a captured browser
# session and will expire — refresh them from a logged-in browser before
# re-running. Do not commit live session tokens to a public gist.
headers = {
    "Host": "demeter.utc.fr",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Referer": "https://demeter.utc.fr/pls/portal30/ENSEIGNEMENT.TX_INSCRIPTIONS_DYN.show",
    "Cookie": "portal30=9.0.3+en-us+us+AMERICA+180EFE62A40E3772E050A8C00A8D22A2+0BFD25221B37A6ACF6CE057E2C413B2551F9272D5C43DD92B67D360932DEB6E09D320B8C14E9E544B66B4C8EC253B3235D14BEFAC3064FEE52191945A1D2580B0978C1D7B20042F3294BBDC68F777E04618A7060EF00BBED; SSO_ID=v1.2~1~6377949A361B8F5ED3EBD324857CC62400E1F68FC6C187CBA239A384E0D1F38173E0B28D599C4A5E841A37F7C3F6BACAA707BA0B07E8FE6396B5E86FA8B3E3948CC11E56CB835ED08D7B8F3887DB339E137E8DF4BFD9FF9AD9BA3D49F5C4151CEE58D9FDC0FC2D28E1B6498B474A2CCDAAC5C04AF06B500C196AC74EC68F4B25BC9F475097000FA7F7560290238B54536CDC3B8EE4CC980BD2D4FF4FA651971F53F96DABC1F67FED08E86779A6C03A40331C397C729B55ABFDB897827DDA4C5DF1BC4CAD9E400D2F1B634A0D5489EB80892D8AD1FB90FF2C733392C74793105DC7CFE37A25A274684CD60778CEB75394; OSSO_USER_CTX=v1.0~275F8A35283A10AFD4138F25CC567CA6FC62593A4B0DE735E657F10975E5050333BD24CF0B0366E9877A1E22639837BCCB3FB078D2C6ADEBF5D5BCE41F38833B3E2D8A11E6041E3A994F343D6AEAA271E47696761BB715D99429E7D0B28FC2D4A6888E87462AA786",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
}

# Raw query-string template; only tx_suj_id varies between requests.
params = "p_arg_names=individu_id&p_arg_values=66076&p_arg_names=tx_suj_id&p_arg_values={sujet_id}&p_arg_names=v_droit_postule&p_arg_values=1&p_arg_names=p_col&p_arg_values=&p_arg_names=v_annee&p_arg_values=2014&p_arg_names=v_periode&p_arg_values=03&p_arg_names=v_droit_visu&p_arg_values=1"

# A Session actually reuses the TCP connection between requests (consistent
# with the keep-alive header above); one-shot requests.post() reconnects on
# every iteration.
session = requests.Session()

sujet_id = 0
while True:
    final_url = URL + "?" + params.format(sujet_id=sujet_id)
    r = session.post(final_url, headers=headers)
    with open('html/%d.html' % sujet_id, 'w') as f:
        f.write(r.text)
    print(sujet_id)  # progress indicator
    sujet_id += 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment