Created
January 12, 2014 08:12
-
-
Save eon01/8382176 to your computer and use it in GitHub Desktop.
Scrape the articles of the Tunisian constitution and save them to separate CSV files, one per chapter.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
from bs4 import BeautifulSoup | |
import urllib2 | |
__author__ = "Aymen Amri" | |
__email__ = "[email protected]" | |
for counter in range (1, 6): | |
counter = str(counter) | |
baseUrl="http://www.marsad.tn/constitution/4/chapitre/" | |
url = baseUrl + counter | |
constFile = urllib2.urlopen(url) | |
constHtml = constFile.read() | |
soup = BeautifulSoup(constHtml) | |
constArt = soup.find_all("div") | |
articles = soup.findAll('div', { "class" : "clear texte" }) | |
titles = soup.findAll('h4', { "class" : "clearfloat" }) | |
print "."* int(counter) | |
import csv | |
fileName= counter +".csv" | |
w = csv.writer(open(fileName, "w")) | |
for article, title in zip(articles, titles): | |
textTitle = "".join(title.findAll(text=True)) | |
textArticle = "".join(article.findAll(text=True)) | |
out = [textTitle.encode('utf-8'), textArticle.encode('utf-8')] | |
w.writerow(out) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment