Skip to content

Instantly share code, notes, and snippets.

@lobstrio
lobstrio / tripadvisor_mail.py
Last active November 23, 2023 13:21
Dynamically extract e-mail addresses from Tripadvisor.com, using Python 3, Requests, and lxml
#!/usr/bin/python3
# coding: utf-8
import requests
from lxml import html
import datetime
import re
import argparse
@lobstrio
lobstrio / pagesjaunes_extract.py
Created November 21, 2018 19:05
Extract names and phone numbers from PagesJaunes.fr with Python 3, Requests, and lxml
#!/usr/bin/python3
# coding: utf-8
import requests
import csv
from lxml import html
import datetime
import argparse
@lobstrio
lobstrio / twitter_dtrump.py
Last active January 8, 2021 14:13
A really simple Python web scraping script for the first tweets of Donald Trump, using Requests and lxml
#!/usr/bin/python3
# coding: utf-8
import requests
from lxml import html
def extract():
"""
Export all Tweets from @realDonaldTrump
@lobstrio
lobstrio / lemonde_headlines.py
Created December 14, 2018 14:36
Extract headlines from the French media website lemonde.fr with Python 3, Requests, and lxml
#!/usr/bin/python3
# coding: utf-8
import requests
from lxml import html
import re
import csv
from collections import Counter
class LeMondeScraper:
@lobstrio
lobstrio / amazon_xmas.py
Created December 20, 2018 13:58
Web Scraping Python Script for the Xmas Deals on Amazon using Requests
# -*- coding: utf-8 -*-
# Copyright(C) 2018 Sasha Bouloudnine
import requests
import sys
import re
import ast
import json
import time
@lobstrio
lobstrio / lacentrale_scraper.py
Created April 15, 2021 18:33
Collect BMW vehicle data on lacentrale.fr
# -*- coding: utf-8 -*-
# Copyright(C) 2021 Sasha Bouloudnine
import requests
from lxml import html
import csv
class CrawlerLaCentrale():
@lobstrio
lobstrio / google_maps_scraping_selenium.py
Created August 3, 2021 17:31
Collect all data from a Search URL on Google Maps 👋
# -*- coding: utf-8 -*-
# Copyright(C) 2021 lobstr
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
import csv
@lobstrio
lobstrio / pappers_pdf_parser_with_python_and_tika.py
Last active April 14, 2023 19:41
Programmatically scrape PDFs from a site with Python 3 and the Tika library
from tika import parser
import re
import csv
HEADERS = ['numero_gestion', 'a_jour_au', 'numero_rcs', 'date_immatriculation', 'raison_sociale', 'forme_juridique', 'capital_social', 'adresse_siege', 'activites_principals']
def parse_pdf(filename):
# request
raw = parser.from_file(filename)
@lobstrio
lobstrio / download_ebooks_from_onion_link_python3_requests.py
Created March 24, 2023 17:23
Download free anarchist ebooks from an .onion site with Python3 and requests 🧅
@lobstrio
lobstrio / bypass_simple_captcha_pytesseract.py
Created April 14, 2023 19:42
Bypass a (simple) CAPTCHA with Python3 and pytesseract 🤖
import cv2
from pytesseract import image_to_string
# pip3 install opencv-python
# pip3 install pytesseract
# brew install tesseract
filename = 'lobstr.jpeg'
img = cv2.imread(filename)
gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)