@secemp9
Created January 26, 2024 19:05
Scrape book listings from kleinanzeigen.de, OCR the listing photos for ISBN barcodes, and export the results to an Excel file.
import time
from io import BytesIO
import cv2
import numpy as np
import pytesseract
import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib.parse
import xlsxwriter
import re
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
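# Note: the Tesseract path above is Windows-specific; on other systems, point it at the
# local tesseract binary instead, or drop the line if tesseract is already on PATH.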
def find_general_isbn(text):
    # The pattern matches ISBN-13-style numbers (978/979 prefix) with optional hyphens
    # between the groups; the formatting step below also handles a 10-digit result in
    # case the digits split differently.
    isbn_pattern = r'(?:97[89])-?\d{1,5}-?\d{1,7}-?\d{1,7}-?[\dX]'
    # Clean up the text by removing whitespace and em-dashes
    clean_text = re.sub(r'[\s—]+', '', text)
    # Find all matches in the clean text
    matches = re.findall(isbn_pattern, clean_text)
    # Format the matches into a standard hyphenated form
    formatted_matches = []
    for match in matches:
        # Remove anything that is not a digit or 'X' from the match
        digits_only = re.sub(r'[^0-9X]', '', match)
        # Insert hyphens at fixed positions based on length (ISBN-10 or ISBN-13)
        if len(digits_only) == 10:
            formatted_isbn = f'{digits_only[:1]}-{digits_only[1:6]}-{digits_only[6:9]}-{digits_only[9:]}'
        elif len(digits_only) == 13:
            formatted_isbn = f'{digits_only[:3]}-{digits_only[3:4]}-{digits_only[4:8]}-{digits_only[8:12]}-{digits_only[12:]}'
        else:
            continue  # Skip candidates of the wrong length
        formatted_matches.append(formatted_isbn)
    return formatted_matches[0] if formatted_matches else None
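# Illustrative example (assumed input, not from the gist). The hyphen grouping is
# fixed-width rather than the official ISBN registration-group split, so:
#   find_general_isbn("ISBN 978-3-16-148410-0")  ->  '978-3-1614-8410-0'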
def process_image(image_url):
    # Download the image
    response = requests.get(image_url)
    image_bytes = BytesIO(response.content)
    # Decode a grayscale copy for processing and a colour copy for drawing/OCR
    im = cv2.imdecode(np.frombuffer(image_bytes.read(), np.uint8), cv2.IMREAD_GRAYSCALE)
    image_bytes.seek(0)  # Reset BytesIO object for reuse
    im_out = cv2.imdecode(np.frombuffer(image_bytes.read(), np.uint8), cv2.IMREAD_COLOR)
    # Rescale the image to a width of 1000 pixels
    scale = 1000.0 / im.shape[1]
    im = cv2.resize(im, (int(im.shape[1] * scale), int(im.shape[0] * scale)))
    # Apply a blackhat morphological operation to emphasise dark features on a light background
    kernel = np.ones((3, 3), np.uint8)
    im = cv2.morphologyEx(im, cv2.MORPH_BLACKHAT, kernel)
    # Apply Otsu threshold
    _, im = cv2.threshold(im, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Apply morphological operations to help bring out the barcodes
    kernel = np.ones((3, 3), np.uint8)
    im = cv2.morphologyEx(im, cv2.MORPH_DILATE, kernel, iterations=3)
    kernel = np.ones((5, 5), np.uint8)
    im = cv2.morphologyEx(im, cv2.MORPH_CLOSE, kernel, iterations=3)
    kernel = np.ones((3, 3), np.uint8)
    im = cv2.morphologyEx(im, cv2.MORPH_OPEN, kernel, iterations=2)
    # Find contours
    contours, _ = cv2.findContours(im, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    unscale = 1.0 / scale
    detected_texts = []
    # Loop over contours to find and draw bounding boxes for barcode candidates
    for contour in contours:
        if cv2.contourArea(contour) > 200:
            x, y, w, h = cv2.boundingRect(contour)
            cv2.rectangle(im_out, (int(x * unscale), int(y * unscale)),
                          (int((x + w) * unscale), int((y + h) * unscale)),
                          (0, 255, 0), 2)
            # Extract the region of interest and apply OCR
            roi = im_out[int(y * unscale):int((y + h) * unscale), int(x * unscale):int((x + w) * unscale)]
            text = pytesseract.image_to_string(roi)
            print(text)
            if text.strip():
                isbn = find_general_isbn(text.strip())
                if isbn:
                    detected_texts.append(isbn)
            # print(f"Detected text: {text}")
    return detected_texts
# Base URL
base_url = 'https://www.kleinanzeigen.de/s-buecher-zeitschriften/seite:{}/c76'
# Initialize WebDriver
driver = webdriver.Chrome()
# Initialize list to store book links
all_book_links = []
# Loop through the listing pages (only page 1 here; widen the range to scrape more pages)
for page in range(1, 2):
    # Construct URL for the current page
    url = base_url.format(page)
    # Navigate to the URL
    driver.get(url)
    # Optional: Wait for the page to load
    try:
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'ellipsis'))
        WebDriverWait(driver, 10).until(element_present)
    except TimeoutException:
        print(f"Timed out waiting for page {page} to load")
    # Get HTML content
    html_content = driver.page_source
    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find all relevant book links
    book_links = soup.find_all('a', class_='ellipsis', href=lambda href: href and '/s-anzeige/' in href)
    book_links_info = [{'href': link.get('href'), 'text': link.get_text(strip=True)} for link in book_links]
    # Append to the all_book_links list
    all_book_links.extend(book_links_info)
    time.sleep(1)
# Form the complete URLs
complete_urls = [{'href': urllib.parse.urljoin(base_url, link['href']), 'text': link['text']}
                 for link in all_book_links]
for i in complete_urls:
    # print(i)
    driver.get(i["href"])
    # Optional: Wait for the page to load
    try:
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'galleryimage-element'))
        WebDriverWait(driver, 10).until(element_present)
    except TimeoutException:
        print(f"Timed out waiting for {i['href']} to load")
    time.sleep(1)
    # Get HTML content
    html_content = driver.page_source
    # Create a BeautifulSoup object and parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find all image tags
    images = soup.find_all('img')
    # Extract the 'src' attribute of each image tag, only if it ends with '.JPG'
    image_links = [img['src'] for img in images if 'src' in img.attrs and img['src'].endswith('.JPG')]
    all_texts = []
    for link in image_links:
        detected_texts = process_image(link)
        all_texts.extend(detected_texts)
    # Add the OCR texts to this listing's dictionary
    i['ocr'] = all_texts
driver.quit()
# Create a new Excel file and add a worksheet
workbook = xlsxwriter.Workbook('book_links.xlsx')
worksheet = workbook.add_worksheet()
# Write headers
worksheet.write('A1', 'Book Title')
worksheet.write('B1', 'URL')
worksheet.write('C1', 'ISBN')
# Write data
row = 1
for link in complete_urls:
    worksheet.write(row, 0, link['text'])
    worksheet.write(row, 1, link['href'])
    # link['ocr'] is a list of ISBN strings; join it so xlsxwriter can write a single cell
    worksheet.write(row, 2, ', '.join(link['ocr']))
    row += 1
# Close the Excel file
workbook.close()
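# Possible extension (a sketch, not part of the original script): the regex in
# find_general_isbn cannot tell a real ISBN-13 from any 13-digit string the OCR happens
# to produce, so a standard checksum test could be used to drop false positives before
# the results are written to the spreadsheet.
def is_valid_isbn13(candidate):
    # An ISBN-13 is valid when its digits, weighted alternately by 1 and 3, sum to a multiple of 10
    digits = re.sub(r'\D', '', candidate)
    if len(digits) != 13:
        return False
    total = sum(int(d) * (1 if i % 2 == 0 else 3) for i, d in enumerate(digits))
    return total % 10 == 0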