scraping...
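# Scrapes book listings from kleinanzeigen.de, downloads each listing's photos,
# runs Tesseract OCR over barcode-like regions to pick out ISBN numbers, and
# writes title / URL / ISBN rows to an Excel file.
# Note: the tesseract_cmd path below assumes a default Windows install of
# Tesseract-OCR; adjust it for your own setup.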
import time
from io import BytesIO

import cv2
import numpy as np
import pytesseract
import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib.parse
import xlsxwriter
import re

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def find_general_isbn(text):
    # Match ISBN-like digit sequences that start with the 978/979 EAN prefix,
    # allowing optional hyphens between the groups. (Bare ISBN-10 numbers
    # without the prefix are not matched by this pattern.)
    isbn_pattern = r'(?:97[89])-?\d{1,5}-?\d{1,7}-?\d{1,7}-?[\dX]'

    # Strip whitespace and em-dash characters so OCR line breaks do not split a number
    clean_text = re.sub(r'[\s—]+', '', text)

    # Find all matches in the cleaned text
    matches = re.findall(isbn_pattern, clean_text)

    # Re-insert hyphens at fixed positions based on length (10 or 13 characters)
    formatted_matches = []
    for match in matches:
        # Keep only digits and the ISBN-10 check character 'X'
        digits_only = re.sub(r'[^0-9X]', '', match)
        if len(digits_only) == 10:
            formatted_isbn = f'{digits_only[:1]}-{digits_only[1:6]}-{digits_only[6:9]}-{digits_only[9:]}'
        elif len(digits_only) == 13:
            formatted_isbn = f'{digits_only[:3]}-{digits_only[3:4]}-{digits_only[4:8]}-{digits_only[8:12]}-{digits_only[12:]}'
        else:
            continue  # Skip sequences that are neither 10 nor 13 characters long
        formatted_matches.append(formatted_isbn)

    return formatted_matches[0] if formatted_matches else None
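# Illustrative example (not from the original gist): on OCR output such as
# "ISBN 978-3-16-148410-0" this returns '978-3-1614-8410-0'; the hyphens are
# re-inserted at fixed offsets rather than at real registration-group breaks.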

def process_image(image_url):
    # Download the image
    response = requests.get(image_url)
    image_bytes = BytesIO(response.content)

    # Decode the image twice: once as grayscale for contour detection,
    # once in colour for drawing boxes and running OCR
    im = cv2.imdecode(np.frombuffer(image_bytes.read(), np.uint8), cv2.IMREAD_GRAYSCALE)
    image_bytes.seek(0)  # Reset BytesIO object for reuse
    im_out = cv2.imdecode(np.frombuffer(image_bytes.read(), np.uint8), cv2.IMREAD_COLOR)

    # Rescale the image to a width of 1000 px
    scale = 1000.0 / im.shape[1]
    im = cv2.resize(im, (int(im.shape[1] * scale), int(im.shape[0] * scale)))

    # Apply a blackhat morphological operation
    kernel = np.ones((3, 3), np.uint8)
    im = cv2.morphologyEx(im, cv2.MORPH_BLACKHAT, kernel)

    # Apply Otsu thresholding
    _, im = cv2.threshold(im, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Apply morphological operations to help bring out the barcodes
    kernel = np.ones((3, 3), np.uint8)
    im = cv2.morphologyEx(im, cv2.MORPH_DILATE, kernel, iterations=3)
    kernel = np.ones((5, 5), np.uint8)
    im = cv2.morphologyEx(im, cv2.MORPH_CLOSE, kernel, iterations=3)
    kernel = np.ones((3, 3), np.uint8)
    im = cv2.morphologyEx(im, cv2.MORPH_OPEN, kernel, iterations=2)

    # Find contours
    contours, _ = cv2.findContours(im, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

    unscale = 1.0 / scale
    detected_texts = []

    # Loop over contours to find and draw bounding boxes for candidate regions
    for contour in contours:
        if cv2.contourArea(contour) > 200:
            x, y, w, h = cv2.boundingRect(contour)
            cv2.rectangle(im_out, (int(x * unscale), int(y * unscale)),
                          (int((x + w) * unscale), int((y + h) * unscale)),
                          (0, 255, 0), 2)
            # Extract the region of interest and apply OCR
            roi = im_out[int(y * unscale):int((y + h) * unscale), int(x * unscale):int((x + w) * unscale)]
            text = pytesseract.image_to_string(roi)
            print(text)
            if text.strip():
                isbn = find_general_isbn(text.strip())
                if isbn:
                    detected_texts.append(isbn)

    return detected_texts
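# Example usage (hypothetical URL, for illustration only):
#   isbns = process_image("https://example.com/book-cover.JPG")
#   # -> a list of formatted ISBN strings found in the photo, possibly empty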

# Base URL for the books & magazines category, paginated
base_url = 'https://www.kleinanzeigen.de/s-buecher-zeitschriften/seite:{}/c76'

# Initialize WebDriver
driver = webdriver.Chrome()

# Initialize list to store book links
all_book_links = []

# Loop through result pages (only page 1 here; widen the range to crawl more)
for page in range(1, 2):
    # Construct URL for the current page
    url = base_url.format(page)

    # Navigate to the URL
    driver.get(url)

    # Optional: wait for the listing titles to load
    try:
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'ellipsis'))
        WebDriverWait(driver, 10).until(element_present)
    except TimeoutException:
        print(f"Timed out waiting for page {page} to load")

    # Get HTML content
    html_content = driver.page_source

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all relevant book links
    book_links = soup.find_all('a', class_='ellipsis', href=lambda href: href and '/s-anzeige/' in href)
    book_links_info = [{'href': link.get('href'), 'text': link.get_text(strip=True)} for link in book_links]

    # Append to the all_book_links list
    all_book_links.extend(book_links_info)

    time.sleep(1)

# Form the complete URLs
complete_urls = [{'href': urllib.parse.urljoin(base_url, link['href']), 'text': link['text']}
                 for link in all_book_links]
for i in complete_urls:
    driver.get(i["href"])

    # Optional: wait for the image gallery to load
    try:
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'galleryimage-element'))
        WebDriverWait(driver, 10).until(element_present)
    except TimeoutException:
        print(f"Timed out waiting for listing {i['href']} to load")
    time.sleep(1)

    # Get HTML content
    html_content = driver.page_source

    # Create a BeautifulSoup object and parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all image tags
    images = soup.find_all('img')

    # Extract the 'src' attribute of each image tag, only if it ends with '.JPG'
    image_links = [img['src'] for img in images if 'src' in img.attrs and img['src'].endswith('.JPG')]

    all_texts = []
    for link in image_links:
        detected_texts = process_image(link)
        all_texts.extend(detected_texts)

    # Add the OCR results to the listing dictionary
    i['ocr'] = all_texts

driver.quit()

# Create a new Excel file and add a worksheet
workbook = xlsxwriter.Ookbook('book_links.xlsx') if False else xlsxwriter.Workbook('book_links.xlsx')
worksheet = workbook.add_worksheet()

# Write headers
worksheet.write('A1', 'Book Title')
worksheet.write('B1', 'URL')
worksheet.write('C1', 'ISBN')

# Write data
row = 1
for link in complete_urls:
    worksheet.write(row, 0, link['text'])
    worksheet.write(row, 1, link['href'])
    # 'ocr' is a list of ISBN strings; join it so it can be written to a single cell
    worksheet.write(row, 2, ', '.join(link['ocr']))
    row += 1

# Close the Excel file
workbook.close()