MaxHalford · August 10, 2023 12:36
diff --git a/annuairechambresdhotes.com.py b/annuairechambresdhotes.com.py
 import requests
 from bs4 import BeautifulSoup
 from PIL import Image
 import pytesseract
 from io import BytesIO
 from dataclasses import dataclass
 from tqdm import tqdm

 @dataclass
 class Ad:
    name: str 
    mobile: str | None = None
    email: str | None = None
    website: str | None = None

 import pandas as pd  # required by the final export; this script never imported pandas

 # Root of the site; also used to turn the relative image paths into full URLs.
 BASE_URL = "http://www.annuairechambresdhotes.com"


 def _fetch_soup(url):
    """Download ``url`` and return its HTML parsed with BeautifulSoup."""
    return BeautifulSoup(requests.get(url).content.decode())


 def _find_li(listing, label):
    """Return the first ``<li>`` of ``listing`` whose text starts with ``label``, or None."""
    return next(
        (li for li in listing.find_all("li") if li.text.startswith(label)),
        None,
    )


 def _ocr_field(listing, label):
    """Read a contact detail that the site publishes as a PNG image.

    Returns the OCR'd text, or None when the listing has no ``label`` entry
    or that entry carries no image.
    """
    li = _find_li(listing, label)
    img_tag = li.find("img") if li is not None else None
    if img_tag is None or not img_tag.get("src"):
        return None
    img_response = requests.get(f"{BASE_URL}{img_tag['src']}")
    img = Image.open(BytesIO(img_response.content), formats=["png"])
    return pytesseract.image_to_string(img).strip()


 ads = []

 home = _fetch_soup(BASE_URL)

 for a in tqdm(home.find_all('a', href=True)):
    # Only follow links that stay on the site.
    if not a['href'].startswith(f'{BASE_URL}/'):
        continue

    # Kept in its own name so the home page being iterated is never rebound.
    page = _fetch_soup(a['href'])

    for listing in page.find_all("div", {"class": "item-list"}):
        email_text = _ocr_field(listing, "Email")
        phone_text = _ocr_field(listing, "Tel")

        # The website is plain text; treat a missing entry like a missing
        # email/phone instead of failing with IndexError.
        site_li = _find_li(listing, "Site internet")
        site_span = site_li.find("span") if site_li is not None else None
        website = site_span.text if site_span is not None else None

        ads.append(
            Ad(
                name=listing.find("h3").find("a").text,
                email=email_text,
                mobile=phone_text,
                website=website,
            )
        )

 pd.DataFrame(ads).to_csv('annuairechambresdhotes.com.csv', index=False)
diff --git a/chambresdhotes-france.fr.py b/chambresdhotes-france.fr.py
 from bs4 import BeautifulSoup
 import requests
 from dataclasses import dataclass, asdict
 from tqdm import tqdm

 @dataclass
 class Ad:
    name: str 
    mobile: str | None = None
    email: str | None = None
    website: str | None = None
    link: str | None = None

 import pandas as pd  # required by the final export; this script never imported pandas


 def _first_link(soup, attrs):
    """Return the first ``<a>`` inside the ``<li>`` matching ``attrs``, or None."""
    li = soup.find("li", attrs)
    return li.find('a') if li is not None else None


 ads = []
 # Walks index pages 1 through 8 of the directory.
 for page_no in tqdm(range(1, 9)):
    url = f'https://www.chambresdhotes-france.fr/adresse/page/{page_no}/'
    index_page = BeautifulSoup(requests.get(url).content.decode())
    for article in index_page.find_all('article'):
        link = article.get('data-permalink')
        if not link:
            # Without a permalink there is no detail page to download.
            continue
        # Kept in its own name so the index page being iterated is never rebound.
        detail_page = BeautifulSoup(requests.get(link).content.decode())

        email_a = _first_link(detail_page, {"id": "listing-email"})
        website_a = _first_link(detail_page, {"id": "listing-website"})
        phone_a = _first_link(detail_page, {"class": "listing-phone"})

        ads.append(
            Ad(
                name=detail_page.find("h1", {"class": "entry-title"}).text,
                mobile=phone_a.text if phone_a is not None else None,
                email=email_a.text if email_a is not None else None,
                website=website_a.get('href') if website_a is not None else None,
                link=link,
            )
        )

 pd.DataFrame(ads).to_csv('chambresdhotes-france.fr.csv', index=False)
diff --git a/les-chambresdhotes.fr.py b/les-chambresdhotes.fr.py
 import pandas as pd
 from dataclasses import dataclass, asdict
 from bs4 import BeautifulSoup
 import requests

 @dataclass
 class Ad:
    department: str
    name: str 
    description: str | None = None
    landline: str | None = None
    mobile: str | None = None
    email: str | None = None
    website: str | None = None
    img: str | None = None

 def iter_departments():
    """Yield a department slug for each matching link on the mobile index page.

    The slug is the second-to-last ``/``-separated component of the link's
    ``href``; links without an ``href``, links pointing elsewhere and empty
    slugs are skipped.
    """
    base = 'https://www.les-chambresdhotes.fr'
    url = f'{base}/annuaire/nouv_site/mobile/index_page_principal_mob.php'
    content = requests.get(url).content.decode()
    soup = BeautifulSoup(content)
    for link in soup.find_all('a'):
        href = link.get('href')
        # The filter used to be 'https://www..fr' (host name missing), so no
        # link matched and nothing was ever yielded.
        # NOTE(review): restored to the site's host; confirm that no narrower
        # path prefix was intended.
        if href and href.startswith(base):
            dep = href.split('/')[-2]
            if dep:
                yield dep

 SITE_URL = 'https://www.les-chambresdhotes.fr'


 def _paragraph(ad_tag, css_class):
    """Return the ``<p>`` inside ``ad_tag``'s ``<div class=css_class>``, or None."""
    div = ad_tag.find(name='div', attrs={'class': css_class})
    return div.find(name='p') if div is not None else None


 def _labelled_lines(ad_tag, css_class, join_label_breaks=False):
    """Return the text lines of the paragraph in ``<div class=css_class>``.

    With ``join_label_breaks`` a line break directly after a ``:`` is removed,
    so a label and the value on the following line end up on one line.
    Returns an empty list when the div or its paragraph is missing.
    """
    p = _paragraph(ad_tag, css_class)
    if p is None:
        return []
    text = p.get_text(strip=True, separator='\n')
    if join_label_breaks:
        text = text.replace(':\n', ':')
    return text.splitlines()


 def _value_after_label(line):
    """Return the text after the first ':' of ``line``, or None without a colon.

    Splitting on the first colon only keeps values that contain colons
    themselves (e.g. ``https://...``) intact; ``split(':')[1]`` cut them short.
    """
    _, sep, value = line.partition(':')
    return value.strip() if sep else None


 ads = []

 for dep in iter_departments():
    url = f'{SITE_URL}/chambre-d-hotes-{dep}/'
    dep_no = dep.split('-')[-1]
    page = BeautifulSoup(requests.get(url).content.decode())

    for _ad in page.find_all(name='div', attrs={'class': 'annonce_nouveau'}):

        landline = None
        mobile = None
        for phone_number in _labelled_lines(_ad, 'tel_nouveau'):
            if phone_number.startswith('Tel'):
                landline = _value_after_label(phone_number)
            if phone_number.startswith('Mobile'):
                mobile = _value_after_label(phone_number)

        email = None
        website = None
        for link in _labelled_lines(_ad, 'lien_nouveau', join_label_breaks=True):
            if link.startswith('Mail'):
                email = _value_after_label(link)
            if link.startswith('Site'):
                website = _value_after_label(link)

        # Description and picture are optional fields: record None rather
        # than aborting the whole scrape when a listing lacks one.
        description_p = _paragraph(_ad, 'comment_nouveau')
        img_div = _ad.find(name='div', attrs={'class': 'image_nouveau2'})
        img_tag = img_div.find(name='img') if img_div is not None else None
        img_src = img_tag.get('src') if img_tag is not None else None

        ads.append(
            Ad(
                department=dep_no,
                name=_ad.find(name='div', attrs={'class': 'titre_nouveau'}).find(name='p').text,
                description=description_p.text if description_p is not None else None,
                landline=landline,
                mobile=mobile,
                email=email,
                website=website,
                img=SITE_URL + img_src if img_src else None,
            )
        )

 pd.DataFrame(ads).to_csv('les-chambresdhotes.fr.csv', index=False)
	import requests
	from bs4 import BeautifulSoup
	from PIL import Image
	import pytesseract
	from io import BytesIO
	from dataclasses import dataclass
	from tqdm import tqdm

	@dataclass
	class Ad:
	name: str
	mobile: str \| None = None
	email: str \| None = None
	website: str \| None = None

	import pandas as pd  # required by the final export; this script never imported pandas

	# Root of the site; also used to turn the relative image paths into full URLs.
	BASE_URL = "http://www.annuairechambresdhotes.com"


	def _fetch_soup(url):
	    """Download ``url`` and return its HTML parsed with BeautifulSoup."""
	    return BeautifulSoup(requests.get(url).content.decode())


	def _find_li(listing, label):
	    """Return the first ``<li>`` of ``listing`` whose text starts with ``label``, or None."""
	    return next(
	        (li for li in listing.find_all("li") if li.text.startswith(label)),
	        None,
	    )


	def _ocr_field(listing, label):
	    """Read a contact detail that the site publishes as a PNG image.

	    Returns the OCR'd text, or None when the listing has no ``label`` entry
	    or that entry carries no image.
	    """
	    li = _find_li(listing, label)
	    img_tag = li.find("img") if li is not None else None
	    if img_tag is None or not img_tag.get("src"):
	        return None
	    img_response = requests.get(f"{BASE_URL}{img_tag['src']}")
	    img = Image.open(BytesIO(img_response.content), formats=["png"])
	    return pytesseract.image_to_string(img).strip()


	ads = []

	home = _fetch_soup(BASE_URL)

	for a in tqdm(home.find_all('a', href=True)):
	    # Only follow links that stay on the site.
	    if not a['href'].startswith(f'{BASE_URL}/'):
	        continue

	    # Kept in its own name so the home page being iterated is never rebound.
	    page = _fetch_soup(a['href'])

	    for listing in page.find_all("div", {"class": "item-list"}):
	        email_text = _ocr_field(listing, "Email")
	        phone_text = _ocr_field(listing, "Tel")

	        # The website is plain text; treat a missing entry like a missing
	        # email/phone instead of failing with IndexError.
	        site_li = _find_li(listing, "Site internet")
	        site_span = site_li.find("span") if site_li is not None else None
	        website = site_span.text if site_span is not None else None

	        ads.append(
	            Ad(
	                name=listing.find("h3").find("a").text,
	                email=email_text,
	                mobile=phone_text,
	                website=website,
	            )
	        )

	pd.DataFrame(ads).to_csv('annuairechambresdhotes.com.csv', index=False)
	from bs4 import BeautifulSoup
	import requests
	from dataclasses import dataclass, asdict
	from tqdm import tqdm

	@dataclass
	class Ad:
	name: str
	mobile: str \| None = None
	email: str \| None = None
	website: str \| None = None
	link: str \| None = None

	import pandas as pd  # required by the final export; this script never imported pandas


	def _first_link(soup, attrs):
	    """Return the first ``<a>`` inside the ``<li>`` matching ``attrs``, or None."""
	    li = soup.find("li", attrs)
	    return li.find('a') if li is not None else None


	ads = []
	# Walks index pages 1 through 8 of the directory.
	for page_no in tqdm(range(1, 9)):
	    url = f'https://www.chambresdhotes-france.fr/adresse/page/{page_no}/'
	    index_page = BeautifulSoup(requests.get(url).content.decode())
	    for article in index_page.find_all('article'):
	        link = article.get('data-permalink')
	        if not link:
	            # Without a permalink there is no detail page to download.
	            continue
	        # Kept in its own name so the index page being iterated is never rebound.
	        detail_page = BeautifulSoup(requests.get(link).content.decode())

	        email_a = _first_link(detail_page, {"id": "listing-email"})
	        website_a = _first_link(detail_page, {"id": "listing-website"})
	        phone_a = _first_link(detail_page, {"class": "listing-phone"})

	        ads.append(
	            Ad(
	                name=detail_page.find("h1", {"class": "entry-title"}).text,
	                mobile=phone_a.text if phone_a is not None else None,
	                email=email_a.text if email_a is not None else None,
	                website=website_a.get('href') if website_a is not None else None,
	                link=link,
	            )
	        )

	pd.DataFrame(ads).to_csv('chambresdhotes-france.fr.csv', index=False)
	import pandas as pd
	from dataclasses import dataclass, asdict
	from bs4 import BeautifulSoup
	import requests

	@dataclass
	class Ad:
	department: str
	name: str
	description: str \| None = None
	landline: str \| None = None
	mobile: str \| None = None
	email: str \| None = None
	website: str \| None = None
	img: str \| None = None

	def iter_departments():
	    """Yield a department slug for each matching link on the mobile index page.

	    The slug is the second-to-last ``/``-separated component of the link's
	    ``href``; links without an ``href``, links pointing elsewhere and empty
	    slugs are skipped.
	    """
	    base = 'https://www.les-chambresdhotes.fr'
	    url = f'{base}/annuaire/nouv_site/mobile/index_page_principal_mob.php'
	    content = requests.get(url).content.decode()
	    soup = BeautifulSoup(content)
	    for link in soup.find_all('a'):
	        href = link.get('href')
	        # The filter used to be 'https://www..fr' (host name missing), so no
	        # link matched and nothing was ever yielded.
	        # NOTE(review): restored to the site's host; confirm that no narrower
	        # path prefix was intended.
	        if href and href.startswith(base):
	            dep = href.split('/')[-2]
	            if dep:
	                yield dep

	SITE_URL = 'https://www.les-chambresdhotes.fr'


	def _paragraph(ad_tag, css_class):
	    """Return the ``<p>`` inside ``ad_tag``'s ``<div class=css_class>``, or None."""
	    div = ad_tag.find(name='div', attrs={'class': css_class})
	    return div.find(name='p') if div is not None else None


	def _labelled_lines(ad_tag, css_class, join_label_breaks=False):
	    """Return the text lines of the paragraph in ``<div class=css_class>``.

	    With ``join_label_breaks`` a line break directly after a ``:`` is removed,
	    so a label and the value on the following line end up on one line.
	    Returns an empty list when the div or its paragraph is missing.
	    """
	    p = _paragraph(ad_tag, css_class)
	    if p is None:
	        return []
	    text = p.get_text(strip=True, separator='\n')
	    if join_label_breaks:
	        text = text.replace(':\n', ':')
	    return text.splitlines()


	def _value_after_label(line):
	    """Return the text after the first ':' of ``line``, or None without a colon.

	    Splitting on the first colon only keeps values that contain colons
	    themselves (e.g. ``https://...``) intact; ``split(':')[1]`` cut them short.
	    """
	    _, sep, value = line.partition(':')
	    return value.strip() if sep else None


	ads = []

	for dep in iter_departments():
	    url = f'{SITE_URL}/chambre-d-hotes-{dep}/'
	    dep_no = dep.split('-')[-1]
	    page = BeautifulSoup(requests.get(url).content.decode())

	    for _ad in page.find_all(name='div', attrs={'class': 'annonce_nouveau'}):

	        landline = None
	        mobile = None
	        for phone_number in _labelled_lines(_ad, 'tel_nouveau'):
	            if phone_number.startswith('Tel'):
	                landline = _value_after_label(phone_number)
	            if phone_number.startswith('Mobile'):
	                mobile = _value_after_label(phone_number)

	        email = None
	        website = None
	        for link in _labelled_lines(_ad, 'lien_nouveau', join_label_breaks=True):
	            if link.startswith('Mail'):
	                email = _value_after_label(link)
	            if link.startswith('Site'):
	                website = _value_after_label(link)

	        # Description and picture are optional fields: record None rather
	        # than aborting the whole scrape when a listing lacks one.
	        description_p = _paragraph(_ad, 'comment_nouveau')
	        img_div = _ad.find(name='div', attrs={'class': 'image_nouveau2'})
	        img_tag = img_div.find(name='img') if img_div is not None else None
	        img_src = img_tag.get('src') if img_tag is not None else None

	        ads.append(
	            Ad(
	                department=dep_no,
	                name=_ad.find(name='div', attrs={'class': 'titre_nouveau'}).find(name='p').text,
	                description=description_p.text if description_p is not None else None,
	                landline=landline,
	                mobile=mobile,
	                email=email,
	                website=website,
	                img=SITE_URL + img_src if img_src else None,
	            )
	        )

	pd.DataFrame(ads).to_csv('les-chambresdhotes.fr.csv', index=False)