Last active
August 10, 2023 12:36
-
-
Save MaxHalford/e5ecbf98e0eb5776511e88c1e2c152f5 to your computer and use it in GitHub Desktop.
Scraping chambres d'hôtes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from dataclasses import dataclass
from io import BytesIO

import pandas as pd
import pytesseract
import requests
from bs4 import BeautifulSoup
from PIL import Image
from tqdm import tqdm
@dataclass | |
class Ad: | |
name: str | |
mobile: str | None = None | |
email: str | None = None | |
website: str | None = None | |
# Scrape annuairechambresdhotes.com: follow every same-site link found on the
# home page, read each listing block on the linked page, and dump the result
# to a CSV file. Emails and phone numbers are published as images on that
# site, hence the OCR step.
_ANNUAIRE_URL = "http://www.annuairechambresdhotes.com"


def _find_labelled_item(listing, label):
    """Return the first <li> of ``listing`` whose text starts with ``label``, else None."""
    return next(
        (li for li in listing.find_all("li") if li.text.startswith(label)),
        None,
    )


def _ocr_labelled_image(listing, label):
    """OCR the image of the <li> starting with ``label``.

    Returns the recognised text stripped of surrounding whitespace, or None
    when the entry, its <img>, or the image's ``src`` attribute is missing.
    """
    item = _find_labelled_item(listing, label)
    img_tag = item.find("img") if item is not None else None
    src = img_tag.get("src") if img_tag is not None else None
    if not src:
        return None
    # The src is concatenated onto the site root, exactly as published.
    img_response = requests.get(f"{_ANNUAIRE_URL}{src}")
    img = Image.open(BytesIO(img_response.content), formats=["png"])
    return pytesseract.image_to_string(img).strip()


ads = []

home_html = requests.get(_ANNUAIRE_URL).content.decode()
home_soup = BeautifulSoup(home_html)

for a in tqdm(home_soup.find_all('a', href=True)):
    # Only follow links that stay on the directory's own site.
    if not a['href'].startswith('http://www.annuairechambresdhotes.com/'):
        continue

    # Distinct names keep the page-level objects from clobbering the
    # home-page ones that the outer loop was built from.
    page_html = requests.get(a['href']).content.decode()
    page_soup = BeautifulSoup(page_html)

    for listing in page_soup.find_all("div", {"class": "item-list"}):
        # Extract email
        email_text = _ocr_labelled_image(listing, "Email")

        # Extract phone
        phone_text = _ocr_labelled_image(listing, "Tel")

        # Extract website; a listing without a "Site internet" entry (or
        # without a <span> in it) yields None instead of aborting the run.
        site_item = _find_labelled_item(listing, "Site internet")
        site_span = site_item.find("span") if site_item is not None else None
        website = site_span.text if site_span is not None else None

        ad = Ad(
            name=listing.find("h3").find("a").text,
            email=email_text,
            mobile=phone_text,
            website=website,
        )
        ads.append(ad)

pd.DataFrame(ads).to_csv('annuairechambresdhotes.com.csv', index=False)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
from dataclasses import dataclass, asdict | |
from tqdm import tqdm | |
@dataclass | |
class Ad: | |
name: str | |
mobile: str | None = None | |
email: str | None = None | |
website: str | None = None | |
link: str | None = None | |
# Scrape chambresdhotes-france.fr: walk the paginated address index, open
# each article's detail page, and dump the contact details to a CSV file.

# pandas is used for the CSV export below but is missing from this script's
# own import header, so it is imported here to keep the script runnable.
import pandas as pd


def _first_anchor(page, attrs):
    """Return the first <a> inside the <li> matching ``attrs``, or None."""
    item = page.find("li", attrs)
    return item.find('a') if item is not None else None


ads = []

for page_no in tqdm(range(1, 9)):
    url = f'https://www.chambresdhotes-france.fr/adresse/page/{page_no}/'
    index_html = requests.get(url).content.decode()
    index_soup = BeautifulSoup(index_html)

    for article in index_soup.find_all('article'):
        link = article.get('data-permalink')
        if not link:
            # Without a permalink there is no detail page to fetch.
            continue

        detail_html = requests.get(link).content.decode()
        detail = BeautifulSoup(detail_html)

        # Each contact field is optional on the detail page.
        email_anchor = _first_anchor(detail, {"id": "listing-email"})
        website_anchor = _first_anchor(detail, {"id": "listing-website"})
        phone_anchor = _first_anchor(detail, {"class": "listing-phone"})

        ad = Ad(
            name=detail.find("h1", {"class": "entry-title"}).text,
            mobile=phone_anchor.text if phone_anchor is not None else None,
            email=email_anchor.text if email_anchor is not None else None,
            website=(
                website_anchor.get('href') if website_anchor is not None else None
            ),
            link=link,
        )
        ads.append(ad)

pd.DataFrame(ads).to_csv('chambresdhotes-france.fr.csv', index=False)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from dataclasses import dataclass, asdict | |
from bs4 import BeautifulSoup | |
import requests | |
@dataclass | |
class Ad: | |
department: str | |
name: str | |
description: str | None = None | |
landline: str | None = None | |
mobile: str | None = None | |
email: str | None = None | |
website: str | None = None | |
img: str | None = None | |
def iter_departments():
    """Yield a department slug for each matching link on the mobile index page.

    The slug is the second-to-last '/'-separated piece of the link's href;
    links whose slug is empty are skipped.
    """
    url = 'https://www.les-chambresdhotes.fr/annuaire/nouv_site/mobile/index_page_principal_mob.php'
    content = requests.get(url).content.decode()
    soup = BeautifulSoup(content)
    for link in soup.find_all('a'):
        href = link.get('href')
        # NOTE(review): the prefix below has an empty host label ("www..fr"),
        # so it looks like the domain name was lost at some point and no real
        # link is likely to match. Confirm the intended host before relying
        # on this filter.
        if href and href.startswith('https://www..fr'):
            dep = href.split('/')[-2]
            if dep:
                yield dep
# Scrape les-chambresdhotes.fr: for every department slug, read each advert
# block of the department page and dump the result to a CSV file.
def _labelled_lines(ad_tag, css_class, *, join_label_breaks=False):
    """Return the text lines of the <p> inside the div of class ``css_class``.

    Returns an empty list when the div or its <p> is missing. With
    ``join_label_breaks`` a line break directly after a ':' is removed first,
    so a label and its value end up on the same line.
    """
    container = ad_tag.find(name='div', attrs={'class': css_class})
    paragraph = container.find(name='p') if container is not None else None
    if paragraph is None:
        return []
    text = paragraph.get_text(strip=True, separator='\n')
    if join_label_breaks:
        text = text.replace(':\n', ':')
    return text.splitlines()


def _value_after_label(line):
    """Return what follows the first ':' of ``line``, stripped; None if empty.

    Only the first colon separates label from value, so values that contain
    colons themselves (such as 'https://...') are kept whole.
    """
    return line.partition(':')[2].strip() or None


def _paragraph_text(ad_tag, css_class):
    """Return the text of the <p> inside the div of class ``css_class``, or None."""
    container = ad_tag.find(name='div', attrs={'class': css_class})
    paragraph = container.find(name='p') if container is not None else None
    return paragraph.text if paragraph is not None else None


def _image_url(ad_tag):
    """Return the advert's image URL (site root + <img> src), or None."""
    container = ad_tag.find(name='div', attrs={'class': 'image_nouveau2'})
    img_tag = container.find(name='img') if container is not None else None
    src = img_tag.get('src') if img_tag is not None else None
    return 'https://www.les-chambresdhotes.fr' + src if src else None


ads = []

for dep in iter_departments():
    url = f'https://www.les-chambresdhotes.fr/chambre-d-hotes-{dep}/'
    dep_no = dep.split('-')[-1]
    content = requests.get(url).content.decode()
    soup = BeautifulSoup(content)

    for ad_tag in soup.find_all(name='div', attrs={'class': 'annonce_nouveau'}):
        landline = None
        mobile = None
        for phone_number in _labelled_lines(ad_tag, 'tel_nouveau'):
            if phone_number.startswith('Tel'):
                landline = _value_after_label(phone_number)
            elif phone_number.startswith('Mobile'):
                mobile = _value_after_label(phone_number)

        email = None
        website = None
        for contact_line in _labelled_lines(
            ad_tag, 'lien_nouveau', join_label_breaks=True
        ):
            if contact_line.startswith('Mail'):
                email = _value_after_label(contact_line)
            elif contact_line.startswith('Site'):
                website = _value_after_label(contact_line)

        ad = Ad(
            department=dep_no,
            # The title is the only mandatory text field, so a missing one
            # still raises rather than producing a nameless row.
            name=ad_tag.find(name='div', attrs={'class': 'titre_nouveau'}).find(name='p').text,
            description=_paragraph_text(ad_tag, 'comment_nouveau'),
            landline=landline,
            mobile=mobile,
            email=email,
            website=website,
            img=_image_url(ad_tag),
        )
        ads.append(ad)

pd.DataFrame(ads).to_csv('les-chambresdhotes.fr.csv', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment