scraping...
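# Scrapes book listings from kleinanzeigen.de, downloads each listing's photos,
# runs Tesseract OCR over barcode-like regions to pick out ISBN numbers, and
# writes title / URL / ISBN rows to an Excel file.
# Note: the tesseract_cmd path below assumes a default Windows install of
# Tesseract-OCR; adjust it for your own setup.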
import time
from io import BytesIO

import cv2
import numpy as np
import pytesseract
import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib.parse
import xlsxwriter
import re

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def find_general_isbn(text):
    # Match ISBN-like digit sequences that start with the 978/979 EAN prefix,
    # allowing optional hyphens between the groups. (Bare ISBN-10 numbers
    # without the prefix are not matched by this pattern.)
    isbn_pattern = r'(?:97[89])-?\d{1,5}-?\d{1,7}-?\d{1,7}-?[\dX]'

    # Strip whitespace and em-dash characters so OCR line breaks do not split a number
    clean_text = re.sub(r'[\s—]+', '', text)

    # Find all matches in the cleaned text
    matches = re.findall(isbn_pattern, clean_text)

    # Re-insert hyphens at fixed positions based on length (10 or 13 characters)
    formatted_matches = []
    for match in matches:
        # Keep only digits and the ISBN-10 check character 'X'
        digits_only = re.sub(r'[^0-9X]', '', match)
        if len(digits_only) == 10:
            formatted_isbn = f'{digits_only[:1]}-{digits_only[1:6]}-{digits_only[6:9]}-{digits_only[9:]}'
        elif len(digits_only) == 13:
            formatted_isbn = f'{digits_only[:3]}-{digits_only[3:4]}-{digits_only[4:8]}-{digits_only[8:12]}-{digits_only[12:]}'
        else:
            continue  # Skip sequences that are neither 10 nor 13 characters long
        formatted_matches.append(formatted_isbn)

    return formatted_matches[0] if formatted_matches else None
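# Illustrative example (not from the original gist): on OCR output such as
# "ISBN 978-3-16-148410-0" this returns '978-3-1614-8410-0'; the hyphens are
# re-inserted at fixed offsets rather than at real registration-group breaks.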

def process_image(image_url):
    # Download the image
    response = requests.get(image_url)
    image_bytes = BytesIO(response.content)

    # Decode the image twice: once as grayscale for contour detection,
    # once in colour for drawing boxes and running OCR
    im = cv2.imdecode(np.frombuffer(image_bytes.read(), np.uint8), cv2.IMREAD_GRAYSCALE)
    image_bytes.seek(0)  # Reset BytesIO object for reuse
    im_out = cv2.imdecode(np.frombuffer(image_bytes.read(), np.uint8), cv2.IMREAD_COLOR)

    # Rescale the image to a width of 1000 px
    scale = 1000.0 / im.shape[1]
    im = cv2.resize(im, (int(im.shape[1] * scale), int(im.shape[0] * scale)))

    # Apply a blackhat morphological operation
    kernel = np.ones((3, 3), np.uint8)
    im = cv2.morphologyEx(im, cv2.MORPH_BLACKHAT, kernel)

    # Apply Otsu thresholding
    _, im = cv2.threshold(im, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Apply morphological operations to help bring out the barcodes
    kernel = np.ones((3, 3), np.uint8)
    im = cv2.morphologyEx(im, cv2.MORPH_DILATE, kernel, iterations=3)
    kernel = np.ones((5, 5), np.uint8)
    im = cv2.morphologyEx(im, cv2.MORPH_CLOSE, kernel, iterations=3)
    kernel = np.ones((3, 3), np.uint8)
    im = cv2.morphologyEx(im, cv2.MORPH_OPEN, kernel, iterations=2)

    # Find contours
    contours, _ = cv2.findContours(im, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

    unscale = 1.0 / scale
    detected_texts = []

    # Loop over contours to find and draw bounding boxes for candidate regions
    for contour in contours:
        if cv2.contourArea(contour) > 200:
            x, y, w, h = cv2.boundingRect(contour)
            cv2.rectangle(im_out, (int(x * unscale), int(y * unscale)),
                          (int((x + w) * unscale), int((y + h) * unscale)),
                          (0, 255, 0), 2)
            # Extract the region of interest and apply OCR
            roi = im_out[int(y * unscale):int((y + h) * unscale), int(x * unscale):int((x + w) * unscale)]
            text = pytesseract.image_to_string(roi)
            print(text)
            if text.strip():
                isbn = find_general_isbn(text.strip())
                if isbn:
                    detected_texts.append(isbn)

    return detected_texts
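# Example usage (hypothetical URL, for illustration only):
#   isbns = process_image("https://example.com/book-cover.JPG")
#   # -> a list of formatted ISBN strings found in the photo, possibly empty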

# Base URL for the books & magazines category, paginated
base_url = 'https://www.kleinanzeigen.de/s-buecher-zeitschriften/seite:{}/c76'

# Initialize WebDriver
driver = webdriver.Chrome()

# Initialize list to store book links
all_book_links = []

# Loop through result pages (only page 1 here; widen the range to crawl more)
for page in range(1, 2):
    # Construct URL for the current page
    url = base_url.format(page)

    # Navigate to the URL
    driver.get(url)

    # Optional: wait for the listing titles to load
    try:
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'ellipsis'))
        WebDriverWait(driver, 10).until(element_present)
    except TimeoutException:
        print(f"Timed out waiting for page {page} to load")

    # Get HTML content
    html_content = driver.page_source

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all relevant book links
    book_links = soup.find_all('a', class_='ellipsis', href=lambda href: href and '/s-anzeige/' in href)
    book_links_info = [{'href': link.get('href'), 'text': link.get_text(strip=True)} for link in book_links]

    # Append to the all_book_links list
    all_book_links.extend(book_links_info)

    time.sleep(1)

# Form the complete URLs
complete_urls = [{'href': urllib.parse.urljoin(base_url, link['href']), 'text': link['text']}
                 for link in all_book_links]
for i in complete_urls:
    driver.get(i["href"])

    # Optional: wait for the image gallery to load
    try:
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'galleryimage-element'))
        WebDriverWait(driver, 10).until(element_present)
    except TimeoutException:
        print(f"Timed out waiting for listing {i['href']} to load")
    time.sleep(1)

    # Get HTML content
    html_content = driver.page_source

    # Create a BeautifulSoup object and parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all image tags
    images = soup.find_all('img')

    # Extract the 'src' attribute of each image tag, only if it ends with '.JPG'
    image_links = [img['src'] for img in images if 'src' in img.attrs and img['src'].endswith('.JPG')]

    all_texts = []
    for link in image_links:
        detected_texts = process_image(link)
        all_texts.extend(detected_texts)

    # Add the OCR results to the listing dictionary
    i['ocr'] = all_texts

driver.quit()

# Create a new Excel file and add a worksheet
workbook = xlsxwriter.Ookbook('book_links.xlsx') if False else xlsxwriter.Workbook('book_links.xlsx')
worksheet = workbook.add_worksheet()

# Write headers
worksheet.write('A1', 'Book Title')
worksheet.write('B1', 'URL')
worksheet.write('C1', 'ISBN')

# Write data
row = 1
for link in complete_urls:
    worksheet.write(row, 0, link['text'])
    worksheet.write(row, 1, link['href'])
    # 'ocr' is a list of ISBN strings; join it so it can be written to a single cell
    worksheet.write(row, 2, ', '.join(link['ocr']))
    row += 1

# Close the Excel file
workbook.close()