Skip to content

Instantly share code, notes, and snippets.

@MaxHalford
Last active August 10, 2023 12:36
Show Gist options
  • Save MaxHalford/e5ecbf98e0eb5776511e88c1e2c152f5 to your computer and use it in GitHub Desktop.
Save MaxHalford/e5ecbf98e0eb5776511e88c1e2c152f5 to your computer and use it in GitHub Desktop.
Scraping chambres d'hôtes
import requests
from bs4 import BeautifulSoup
from PIL import Image
import pytesseract
from io import BytesIO
from dataclasses import dataclass
from tqdm import tqdm
@dataclass
class Ad:
name: str
mobile: str | None = None
email: str | None = None
website: str | None = None
# Scrape annuairechambresdhotes.com: follow every same-site link found on the
# home page, read the contact details of each listing on those pages and
# export everything to a CSV file.
import pandas as pd  # the CSV export below needs pandas, which was never imported

BASE_URL = "http://www.annuairechambresdhotes.com"


def _first_li_with_prefix(listing, prefix):
    """Return the first <li> in *listing* whose text starts with *prefix*, or None."""
    return next(
        (li for li in listing.find_all("li") if li.text.startswith(prefix)),
        None,
    )


def _ocr_contact_image(listing, prefix):
    """Read the value of the <li> labelled *prefix* by running OCR on its image.

    The script fetches the <img> inside the entry as a PNG and passes it to
    tesseract. Returns the recognised text, or None when the listing has no
    such entry or the entry has no image source.
    """
    li = _first_li_with_prefix(listing, prefix)
    if li is None:
        return None
    img_tag = li.find("img")
    if img_tag is None or not img_tag.get("src"):
        return None
    img_response = requests.get(f"{BASE_URL}{img_tag['src']}")
    with Image.open(BytesIO(img_response.content), formats=["png"]) as img:
        return pytesseract.image_to_string(img).strip()


def _website_text(listing):
    """Return the text of the "Site internet" entry's <span>, or None if absent."""
    li = _first_li_with_prefix(listing, "Site internet")
    span = li.find("span") if li is not None else None
    return span.text if span is not None else None


ads = []
index_soup = BeautifulSoup(requests.get(BASE_URL).content.decode())
for a in tqdm(index_soup.find_all('a', href=True)):
    link = a['href']
    # Only follow links that stay on the site.
    if not link.startswith(f"{BASE_URL}/"):
        continue
    # Separate names for the index page and the visited page, so the inner
    # parse never overwrites what the outer loop is iterating over.
    page_soup = BeautifulSoup(requests.get(link).content.decode())
    for listing in page_soup.find_all("div", {"class": "item-list"}):
        ads.append(
            Ad(
                name=listing.find("h3").find("a").text,
                email=_ocr_contact_image(listing, "Email"),
                mobile=_ocr_contact_image(listing, "Tel"),
                # A listing without a website no longer raises IndexError.
                website=_website_text(listing),
            )
        )

pd.DataFrame(ads).to_csv('annuairechambresdhotes.com.csv', index=False)
from bs4 import BeautifulSoup
import requests
from dataclasses import dataclass, asdict
from tqdm import tqdm
@dataclass
class Ad:
name: str
mobile: str | None = None
email: str | None = None
website: str | None = None
link: str | None = None
# Scrape chambresdhotes-france.fr: walk the paginated address index, open each
# listing page and export the contact details to a CSV file.
import pandas as pd  # the CSV export below needs pandas, which was never imported


def _first_anchor(soup, li_attrs):
    """Return the first <a> inside the first <li> matching *li_attrs*, or None."""
    li = soup.find("li", li_attrs)
    return li.find('a') if li is not None else None


ads = []
for page_no in tqdm(range(1, 9)):
    url = f'https://www.chambresdhotes-france.fr/adresse/page/{page_no}/'
    index_soup = BeautifulSoup(requests.get(url).content.decode())
    for article in index_soup.find_all('article'):
        link = article.get('data-permalink')
        if not link:
            # No permalink on this <article>; requests.get(None) would raise.
            continue
        detail_soup = BeautifulSoup(requests.get(link).content.decode())

        title = detail_soup.find("h1", {"class": "entry-title"})
        if title is None:
            # Without a title there is no name to record; skip the page
            # instead of aborting the whole run with AttributeError.
            continue

        email_a = _first_anchor(detail_soup, {"id": "listing-email"})
        website_a = _first_anchor(detail_soup, {"id": "listing-website"})
        # The phone entry is matched by class, the other two by id.
        mobile_a = _first_anchor(detail_soup, {"class": "listing-phone"})

        ads.append(
            Ad(
                name=title.text,
                mobile=mobile_a.text if mobile_a is not None else None,
                email=email_a.text if email_a is not None else None,
                website=website_a.get('href') if website_a is not None else None,
                link=link,
            )
        )

pd.DataFrame(ads).to_csv('chambresdhotes-france.fr.csv', index=False)
import pandas as pd
from dataclasses import dataclass, asdict
from bs4 import BeautifulSoup
import requests
@dataclass
class Ad:
department: str
name: str
description: str | None = None
landline: str | None = None
mobile: str | None = None
email: str | None = None
website: str | None = None
img: str | None = None
def iter_departments():
    """Yield a department slug for each matching link on the mobile index page.

    Downloads the mobile index page, looks at every <a> whose href starts with
    the site prefix and yields the second-to-last '/'-separated part of that
    href (the last path segment when the href ends with '/'). Empty slugs are
    skipped.
    """
    url = 'https://www.les-chambresdhotes.fr/annuaire/nouv_site/mobile/index_page_principal_mob.php'
    content = requests.get(url).content.decode()
    soup = BeautifulSoup(content)
    # The prefix used to be 'https://www..fr', which has an empty host label
    # and therefore never matches, so nothing was ever yielded.
    # NOTE(review): restored to the host used by the other URLs in this
    # script -- confirm against the hrefs actually present on the index page.
    site_prefix = 'https://www.les-chambresdhotes.fr'
    for link in soup.find_all('a'):
        href = link.get('href')
        if not href or not href.startswith(site_prefix):
            continue
        parts = href.split('/')
        # parts[:3] is ['https:', '', host]; with fewer than 5 parts, parts[-2]
        # would be the scheme or host rather than a path segment.
        if len(parts) < 5:
            continue
        dep = parts[-2]
        if dep:
            yield dep
# Scrape les-chambresdhotes.fr: visit the page of every department yielded by
# iter_departments(), read each advert block and export all adverts to CSV.


def _find_paragraph(ad_tag, css_class):
    """Return the first <p> inside the first <div class=css_class>, or None."""
    container = ad_tag.find(name='div', attrs={'class': css_class})
    return container.find(name='p') if container is not None else None


def _paragraph_lines(ad_tag, css_class, join_label_breaks=False):
    """Return the text lines of the <p> inside <div class=css_class>.

    With *join_label_breaks*, a line break directly after a ':' is removed so
    a label and its value end up on the same line. Returns an empty list when
    the div or its paragraph is missing.
    """
    paragraph = _find_paragraph(ad_tag, css_class)
    if paragraph is None:
        return []
    text = paragraph.get_text(strip=True, separator='\n')
    if join_label_breaks:
        text = text.replace(':\n', ':')
    return text.splitlines()


def _value_after_label(line):
    """Return what follows the first ':' in *line*, stripped; None if empty.

    Splitting only on the first ':' keeps values that themselves contain a
    colon intact (e.g. 'https://example.com' is no longer cut down to 'https').
    """
    return line.partition(':')[2].strip() or None


ads = []
for dep in iter_departments():
    url = f'https://www.les-chambresdhotes.fr/chambre-d-hotes-{dep}/'
    dep_no = dep.split('-')[-1]
    soup = BeautifulSoup(requests.get(url).content.decode())

    for ad_tag in soup.find_all(name='div', attrs={'class': 'annonce_nouveau'}):
        landline = None
        mobile = None
        for phone_line in _paragraph_lines(ad_tag, 'tel_nouveau'):
            if phone_line.startswith('Tel'):
                landline = _value_after_label(phone_line)
            if phone_line.startswith('Mobile'):
                mobile = _value_after_label(phone_line)

        email = None
        website = None
        for link_line in _paragraph_lines(ad_tag, 'lien_nouveau', join_label_breaks=True):
            if link_line.startswith('Mail'):
                email = _value_after_label(link_line)
            if link_line.startswith('Site'):
                website = _value_after_label(link_line)

        # description and img are optional on Ad, so a missing element
        # becomes None instead of raising.
        description_p = _find_paragraph(ad_tag, 'comment_nouveau')
        image_div = ad_tag.find(name='div', attrs={'class': 'image_nouveau2'})
        image = image_div.find(name='img') if image_div is not None else None
        src = image.get('src') if image is not None else None

        ads.append(
            Ad(
                department=dep_no,
                # name is required; a missing title still raises AttributeError.
                name=ad_tag.find(name='div', attrs={'class': 'titre_nouveau'}).find(name='p').text,
                description=description_p.text if description_p is not None else None,
                landline=landline,
                mobile=mobile,
                email=email,
                website=website,
                img=f'https://www.les-chambresdhotes.fr{src}' if src else None,
            )
        )

pd.DataFrame(ads).to_csv('les-chambresdhotes.fr.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment