# -*- coding: utf-8 -*-
""" | |
Get Paris attractions data scraping the BASE_URL site. | |
Wrap those data into a json structure. | |
Deps: | |
- BeautifulSoup | |
- requests | |
- fn | |
""" | |
from bs4 import BeautifulSoup
import json
import requests
from fn.monad import Option
import unittest

BASE_URL = "http://www.parigi.it/it/cosa_vedere_a_parigi.php"
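
# A note on the Option chains used throughout (a minimal sketch, assuming the
# fn library's behaviour): fn.monad.Option wraps a possibly-None value, each
# .map() step is applied only while a value is present, and .get_or() supplies
# a fallback once any step has produced None. For instance:
#
#   Option(soup.find("h3")).map(str).get_or("Content not available")
#
# yields str(tag) when an <h3> exists and the fallback string otherwise.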
class ScrapedData:
    """
    Dummy container to store information retrieved from scraped content
    """

    def __init__(self, img, subtitle, text, other_infos, correlated):
        self.img = img
        self.subtitle = subtitle
        self.text = text
        self.other_infos = other_infos
        self.correlated = correlated

    def to_dict(self):
        return self.__dict__

    def __str__(self):
        return self.subtitle
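
# A quick illustration with hypothetical values: to_dict() exposes the
# instance attributes as a plain dict, ready for json.dumps:
#
#   ScrapedData(img="/img/tour-eiffel.jpg", subtitle="<h3>Tour Eiffel</h3>",
#               text="<p>...</p>", other_infos="", correlated=[]).to_dict()
#   # -> {'img': '/img/tour-eiffel.jpg', 'subtitle': '<h3>Tour Eiffel</h3>',
#   #     'text': '<p>...</p>', 'other_infos': '', 'correlated': []}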
def do_basic_search():
    """
    Scrape the main page to get attraction page urls;
    process each retrieved url.
    """
    print("Scraping main page")
    soup = (
        Option(requests.get(BASE_URL))
        .map(lambda result: result.text)
        .map(lambda html: BeautifulSoup(html, "html.parser"))
        .get_or(BeautifulSoup("", "html.parser")))  # empty soup, not "", so findAll below cannot fail
    boxes = soup.findAll("div", {"class": "paragrafo_correlazioni_box"})
    # drop entries for which scrape_content failed and returned None
    return [data for data in (scrape_content(box) for box in boxes) if data is not None]

def scrape_content(element):
    """
    Scrape a single attraction page, extract the relevant information
    and wrap it in a ScrapedData instance.
    """
    print("Processing %s" % element.find("a").attrs["title"])
    try:
        data = requests.get(element.find("a").attrs["href"]).text
        soup = BeautifulSoup(data, "html.parser").find("div", {"class": "centro"})
        return ScrapedData(
            img=Option(soup.find("div", {"class": "view"}))
                .map(lambda div: div.find("a"))
                .map(lambda link: link.attrs["href"])
                .get_or("Image not available"),
            subtitle=Option(soup.find("div", {"class": "paragrafo_1_testo"}))
                .map(lambda div: div.find("h3"))
                .map(str)
                .get_or("Content not available"),
            text=Option(soup.find("div", {"class": "paragrafo_1_testo"}))
                .map(lambda div: div.find("p"))
                .map(str)
                .get_or("Content not available"),
            other_infos="<br />".join(str(item) for item in soup.findAll("table", {"class": "tabella_contenuti"})),
            # materialized as a list so the instance stays JSON-serializable
            correlated=[box.find("a").attrs.get("title", "")
                        for box in soup.findAll("div", {"class": "paragrafo_correlazioni_box"})])
    except Exception as e:
        print(e)
        print("error in processing %s" % element.find("a").attrs["href"])
        return None

class TestScraping(unittest.TestCase):

    def test_scraping(self):
        result = do_basic_search()
        assert result
        self.assertTrue(len(result) > 2)
        print(
            Option([item.to_dict() for item in result])  # a list, not a lazy map object, so json.dumps works
            .map(lambda lst: dict(result=lst))
            .map(json.dumps)
            .get_or(json.dumps(dict(result="No data available"))))


if __name__ == '__main__':
    unittest.main()
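
# Example usage (a sketch; assumes this gist is saved locally as scraper.py):
#
#   $ python scraper.py
#
# unittest.main() picks up TestScraping, which scrapes the live site and
# prints the collected attractions as a single JSON document.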