Skip to content

Instantly share code, notes, and snippets.

@eon01
Created January 12, 2014 08:12
Show Gist options
  • Save eon01/8382176 to your computer and use it in GitHub Desktop.
Save eon01/8382176 to your computer and use it in GitHub Desktop.
Scrape the Tunisian constitution articles and save them to separate CSV files, one per chapter.
#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
__author__ = "Aymen Amri"
__email__ = "[email protected]"
import csv

# Root of the per-chapter pages; the chapter number is appended to it.
baseUrl = "http://www.marsad.tn/constitution/4/chapitre/"

# Download chapters 1 through 5 and write each one to "<chapter>.csv" in the
# current directory, one row per article: [title, article text], UTF-8 encoded.
for chapter in range(1, 6):
    counter = str(chapter)
    url = baseUrl + counter

    # urllib2 responses are not context managers on Python 2, so close the
    # connection explicitly even if read() raises.
    constFile = urllib2.urlopen(url)
    try:
        constHtml = constFile.read()
    finally:
        constFile.close()

    # NOTE(review): no parser is named, so bs4 picks the best one installed;
    # pin one (e.g. "html.parser") if results must be reproducible.
    soup = BeautifulSoup(constHtml)
    articles = soup.findAll('div', {"class": "clear texte"})
    titles = soup.findAll('h4', {"class": "clearfloat"})

    # Progress indicator: one more dot for each chapter processed.
    print("." * chapter)

    fileName = counter + ".csv"
    # Binary mode is what the Python 2 csv module expects (avoids extra "\r"
    # on Windows); the with-block guarantees the rows are flushed and the
    # file is closed.
    with open(fileName, "wb") as csvFile:
        w = csv.writer(csvFile)
        # Titles and articles are paired by position (presumably the page
        # lists them in the same order -- verify against the site). zip stops
        # at the shorter list, so an unmatched title/article is dropped.
        for article, title in zip(articles, titles):
            textTitle = "".join(title.findAll(text=True))
            textArticle = "".join(article.findAll(text=True))
            w.writerow([textTitle.encode('utf-8'),
                        textArticle.encode('utf-8')])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment