Gist by @brunifrancesco, created June 13, 2016.

# -*- coding: utf-8 -*-
"""
Get Paris attractions data scraping the BASE_URL site.
Wrap those data into a json structure.
Deps:
- BeautifulSoup
- requests
- fn
"""
import json
import unittest

import requests
from bs4 import BeautifulSoup
from fn.monad import Option

BASE_URL = "http://www.parigi.it/it/cosa_vedere_a_parigi.php"
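
# The fn.monad.Option pipeline used throughout replaces nested None checks:
# Option(x) wraps a possibly-missing value, .map(f) applies f only when a value
# is present, and .get_or(default) unwraps it with a fallback. A minimal sketch
# (the tag and attribute names here are illustrative, not from the target site):
#   Option(soup.find("a")).map(lambda a: a.attrs["href"]).get_or("n/a")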


class ScrapedData:
    """
    Dummy container to store information retrieved from scraped content
    """

    def __init__(self, img, subtitle, text, other_infos, correlated):
        self.img = img
        self.subtitle = subtitle
        self.text = text
        self.other_infos = other_infos
        self.correlated = correlated

    def to_dict(self):
        return self.__dict__

    def __str__(self):
        return self.subtitle
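
# A minimal usage sketch (the values below are hypothetical):
#   ScrapedData("/img/louvre.jpg", "<h3>Louvre</h3>", "<p>...</p>", "", []).to_dict()
# returns a plain dict keyed by the constructor arguments, ready for json.dumps.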


def do_basic_search():
    """
    Scrape the main page to get the attraction page urls;
    process each retrieved url.
    """
    print("Scraping main page")
    soup = (Option(requests.get(BASE_URL))
            .map(lambda result: result.text)
            .map(lambda html: BeautifulSoup(html, "html.parser"))
            .get_or(BeautifulSoup("", "html.parser")))  # empty soup keeps findAll usable
    # scrape_content returns None on failure; filter those entries out
    boxes = soup.findAll("div", {"class": "paragrafo_correlazioni_box"})
    return [item for item in map(scrape_content, boxes) if item]


def scrape_content(element):
    """
    Scrape a single attraction page for the relevant information
    and wrap it in a ScrapedData instance; return None on failure.
    """
    print("Processing %s" % element.find("a").attrs["title"])
    try:
        data = requests.get(element.find("a").attrs["href"]).text
        soup = BeautifulSoup(data, "html.parser").find("div", {"class": "centro"})
        return ScrapedData(
            img=Option(soup.find("div", {"class": "view"}))
                .map(lambda div: div.find("a"))
                .map(lambda link: link.attrs["href"])
                .get_or("Image not available"),
            subtitle=Option(soup.find("div", {"class": "paragrafo_1_testo"}))
                .map(lambda div: div.find("h3"))
                .map(str)
                .get_or("Content not available"),
            text=Option(soup.find("div", {"class": "paragrafo_1_testo"}))
                .map(lambda item: item.find("p"))
                .map(str)
                .get_or("Content not available"),
            other_infos="<br />".join(
                str(item) for item in soup.findAll("table", {"class": "tabella_contenuti"})
            ),
            # materialize as a list so the result is JSON-serializable in Python 3
            correlated=[
                box.find("a").attrs.get("title", "")
                for box in soup.findAll("div", {"class": "paragrafo_correlazioni_box"})
            ],
        )
    except Exception as e:
        print(e)
        print("error in processing %s" % element.find("a").attrs["href"])
        return None


class TestScraping(unittest.TestCase):

    def test_scraping(self):
        result = do_basic_search()
        assert result
        self.assertTrue(len(result) > 2)
        print(
            # wrap in list(...) so the mapped items can be dumped as JSON
            Option(list(map(lambda item: item.to_dict(), result)))
            .map(lambda lst: dict(result=lst))
            .map(lambda result: json.dumps(result))
            .get_or(json.dumps(dict(result="No data available"))))


if __name__ == '__main__':
    unittest.main()
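
# To run the scraper through its test (assuming the file is saved as scraper.py):
#   python scraper.py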