Created
November 24, 2024 07:20
-
-
Save mhsharifi96/c1303a1ed6ae51e2579c196f247438cc to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Find Digikala "treasure hunt" (شکار گنج) products by running OCR on product images."""
import os | |
import requests | |
from multiprocessing import Pool, cpu_count | |
import random | |
# Make sure the download folder and its "final" subfolder exist before any
# image is written.
for _image_dir in ("digikala_images", "digikala_images/final"):
    os.makedirs(_image_dir, exist_ok=True)
import shutil | |
from PIL import Image | |
import pytesseract | |
def find_text_in_image(image_path, product_id, target_text=''):
    """Run Persian OCR on an image and report whether it mentions the target phrase.

    Args:
        image_path: Path of the image file to scan.
        product_id: Product id, used only to build the links that are printed.
        target_text: Phrase to look for. When empty (the default), both
            spellings of the treasure-hunt phrase ("شکار گنج" and "شکارگنج")
            are searched.

    Returns:
        True if one of the phrases occurs in the OCR output. False otherwise,
        including when opening the image or running OCR raises.
    """
    # Default phrases: the same words written with and without a space.
    targets = (target_text,) if target_text else ("شکار گنج", "شکارگنج")
    try:
        # The context manager closes the image file even if OCR raises.
        with Image.open(image_path) as img:
            # 'fas' selects the Persian language data.
            extracted_text = pytesseract.image_to_string(img, lang="fas")

        stripped_text = extracted_text.strip()
        if stripped_text:
            print('----------------------')
            print(f"link : https://www.digikala.com/product/dkp-{product_id}/")
            print(f"{product_id} : {stripped_text}")

        matched = next((t for t in targets if t in extracted_text), None)
        if matched is None:
            return False

        print(f"Text found Text found Text found : {matched}")
        print(f"Text found Text found link : https://www.digikala.com/product/dkp-{product_id}/")
        print('----------*************************------------')
        return True
    except Exception as e:
        # Best effort: one unreadable image must not stop the caller.
        print(f"Error processing image: {e}")
        return False
def copy_file(source_path, destination_dir):
    """Copy a file into a directory, creating the directory if it is missing.

    Args:
        source_path: Path of the file to copy.
        destination_dir: Directory that should receive the copy.

    Returns:
        None. Success and failure are both reported with a printed message;
        OS-level errors are caught rather than raised.
    """
    try:
        # Without this, shutil.copy treats a missing directory path as the
        # destination *file name* (or fails if the parent is missing too).
        os.makedirs(destination_dir, exist_ok=True)
        shutil.copy(source_path, destination_dir)
    except OSError as e:
        print(f"Error copying file: {e}")
    else:
        print(f"File copied from {source_path} to {destination_dir}")
def fetch_products(page_url, timeout=10):
    """Fetch a category listing page and return its decoded JSON body.

    Args:
        page_url: Full URL of the listing endpoint to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The decoded JSON on an HTTP 200 response; None on any other status,
        on a request failure, or when the body is not valid JSON.
    """
    try:
        response = requests.get(page_url, timeout=timeout)
    except requests.RequestException:
        return None

    if response.status_code != 200:
        return None

    try:
        return response.json()
    except ValueError:
        # Invalid JSON surfaces as a ValueError subclass.
        return None
def fetch_product_details(product_id, timeout=10):
    """Fetch the details document of a single product.

    Args:
        product_id: Product id interpolated into the product API URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The decoded JSON on an HTTP 200 response; None on any other status,
        on a request failure, or when the body is not valid JSON.
    """
    product_url = f"https://api.digikala.com/v2/product/{product_id}/"
    try:
        response = requests.get(product_url, timeout=timeout)
    except requests.RequestException:
        return None

    if response.status_code != 200:
        return None

    try:
        return response.json()
    except ValueError:
        # Invalid JSON surfaces as a ValueError subclass.
        return None
def download_image(image_url, product_id, folder="digikala_images", final_dir=None, timeout=30):
    """Download one image, OCR it, and keep a copy if the target phrase is found.

    Args:
        image_url: URL of the image to download.
        product_id: Product id; used in the saved file name and passed on to
            the OCR step.
        folder: Directory the downloaded image is written to.
        final_dir: Directory that receives images in which the phrase was
            found. Defaults to the "final" subfolder of ``folder`` instead of
            a machine-specific absolute path.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Download and file errors are printed, not raised.
    """
    if final_dir is None:
        final_dir = os.path.join(folder, "final")

    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(image_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                return
            # The random suffix keeps several images of one product apart.
            image_name = f"{product_id}_{random.randint(0, 10000)}.jpeg"
            image_path = os.path.join(folder, image_name)
            with open(image_path, "wb") as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {image_url}: {e}")
        return

    if find_text_in_image(image_path=image_path, product_id=product_id):
        # Ensure the target is a real directory before copying into it.
        os.makedirs(final_dir, exist_ok=True)
        copy_file(source_path=image_path, destination_dir=final_dir)
def process_product(product_id):
    """Fetch one product's details and download each of its images.

    Args:
        product_id: Id of the product to process.

    Returns:
        None. Products whose details cannot be fetched are skipped silently;
        products without any image get their id printed.
    """
    product_data = fetch_product_details(product_id)
    if not product_data:
        return

    # `or {}` also covers keys that are present but null in the JSON, which a
    # plain .get(key, {}) default does not.
    product = (product_data.get("data") or {}).get("product") or {}
    images = (product.get("images") or {}).get("list") or []
    if not images:
        print(f"product_id : ::::: {product_id}")
        return

    for image in images:
        urls = image.get("url")
        if not urls:
            # Missing or empty "url": nothing to download for this entry.
            continue
        # "url" is indexed as a sequence of URLs; tolerate a bare string too.
        image_url = urls if isinstance(urls, str) else urls[0]
        if image_url:
            download_image(image_url=image_url, product_id=product_id)
def main(max_page=200, category="rural-products"):
    """Walk the category listing page by page and process every product found.

    Args:
        max_page: Number of listing pages to request (page numbers
            0 .. max_page - 1).
        category: Category slug placed in the search URL.

    Returns:
        None. The crawl stops at the first page that yields no data.
    """
    # Create the worker pool once and reuse it for every page; starting new
    # processes per page is wasted work.
    with Pool(cpu_count()) as pool:
        # NOTE(review): numbering starts at 0 - confirm the API's first page index.
        for page in range(max_page):
            print(page)
            category_url = (
                f"https://api.digikala.com/v1/categories/{category}/search/"
                f"?page={page}&sort=7&th_no_track=1"
            )
            print(category_url)

            category_data = fetch_products(category_url)
            if not category_data:
                return

            # `or` fallbacks also cover keys that are present but null.
            products = (category_data.get("data") or {}).get("products") or []
            product_ids = [
                product["id"] for product in products
                if product.get("id") is not None
            ]
            if product_ids:
                pool.map(process_product, product_ids)


# The guard keeps worker processes that re-import this module from starting
# the crawl themselves.
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from PIL import Image | |
import pytesseract | |
def find_text_in_image(image_path, product_id, target_text=''):
    """Run Persian OCR on an image and report whether it contains target_text.

    Args:
        image_path: Path of the image file to scan.
        product_id: Label printed next to the extracted text.
        target_text: Phrase to look for. An empty phrase never matches.

    Returns:
        True if target_text is non-empty and occurs in the OCR output. False
        otherwise, including when opening the image or running OCR raises.
    """
    try:
        # The context manager closes the image file even if OCR raises.
        with Image.open(image_path) as img:
            # 'fas' selects the Persian language data.
            extracted_text = pytesseract.image_to_string(img, lang="fas")

        print('----------------------')
        print(f"{product_id} : {extracted_text}")
        print('----------------------')

        # An empty string is a substring of every string, so without the
        # first check the default argument would always report a match.
        if target_text and target_text in extracted_text:
            print(f"Text found: {target_text}")
            return True

        print("Text not found.")
        return False
    except Exception as e:
        # Best effort: report the failure and treat it as "not found".
        print(f"Error processing image: {e}")
        return False
# Test the function
image_path = "/home/user1/Documents/project/digi/digikala_images/sample.jpeg"  # Replace with your image file path
target_text = "شکار گنج"
# Pass the phrase by keyword: the second positional parameter is product_id,
# so a positional call would leave target_text at its empty default.
find_text_in_image(image_path, product_id="sample", target_text=target_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment