Skip to content

Instantly share code, notes, and snippets.

@mhsharifi96
Created November 24, 2024 07:20
Show Gist options
  • Save mhsharifi96/c1303a1ed6ae51e2579c196f247438cc to your computer and use it in GitHub Desktop.
Save mhsharifi96/c1303a1ed6ae51e2579c196f247438cc to your computer and use it in GitHub Desktop.
""" این کد جهت یافتن شکار گنج دیجی کالاست"""
import os
import requests
from multiprocessing import Pool, cpu_count
import random
# Make sure the download folder and its "final" sub-folder (where matching
# images are copied) exist before any worker writes to them.
for _directory in ("digikala_images", "digikala_images/final"):
    os.makedirs(_directory, exist_ok=True)
import shutil
from PIL import Image
import pytesseract
def find_text_in_image(image_path, product_id, target_text=''):
    """Run Persian OCR on an image and report whether a phrase occurs in it.

    Any non-empty OCR output is printed together with the product link so
    the run can be followed on the console.

    Args:
        image_path: Path of the image file to scan.
        product_id: Digikala product id; used only to build the printed link.
        target_text: Optional phrase to search for. When empty (the default)
            the two spellings "شکار گنج" and "شکارگنج" are searched instead.

    Returns:
        True if one of the searched phrases occurs in the OCR output,
        otherwise False (also when the image cannot be processed).
    """
    # An explicit phrase wins. The empty default must never be searched for
    # directly, because '' is a substring of every string.
    targets = (target_text,) if target_text else ("شکار گنج", "شکارگنج")
    try:
        # The context manager closes the image file once OCR is done.
        with Image.open(image_path) as img:
            # 'fas' selects the Persian language data of Tesseract.
            extracted_text = pytesseract.image_to_string(img, lang="fas")

        stripped_text = extracted_text.strip()
        if stripped_text:
            print('----------------------')
            print(f"link : https://www.digikala.com/product/dkp-{product_id}/")
            print(f"{product_id} : {stripped_text}")

        matched = next((t for t in targets if t in extracted_text), None)
        if matched is None:
            return False

        print(f"Text found Text found Text found : {matched}")
        print(f"Text found Text found link : https://www.digikala.com/product/dkp-{product_id}/")
        print('----------*************************------------')
        return True
    except Exception as e:
        # Best-effort scan: a broken image, a missing OCR language pack or an
        # unprintable console character must not abort the whole crawl.
        print(f"Error processing image: {e}")
        return False
def copy_file(source_path, destination_dir):
    """Copy ``source_path`` into ``destination_dir`` and print the outcome.

    A failed copy is reported on stdout instead of being raised; the
    function returns nothing either way.
    """
    try:
        shutil.copy(source_path, destination_dir)
        outcome = f"File copied from {source_path} to {destination_dir}"
    except Exception as error:
        outcome = f"Error copying file: {error}"
    print(outcome)
def fetch_products(page_url, timeout=10):
    """Fetch one category search page and return its decoded JSON body.

    Args:
        page_url: Full URL of the category search request.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The parsed JSON document on HTTP 200, otherwise None (non-200
        status, network failure, or a body that is not valid JSON).
    """
    try:
        response = requests.get(page_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch products: {response.status_code}")
            return None
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout errors; ValueError
        # covers a response body that cannot be decoded as JSON.
        print(f"Error fetching products: {e}")
        return None
def fetch_product_details(product_id, timeout=10):
    """Fetch the detail document of a single product.

    Args:
        product_id: Product id inserted into the product API URL.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The parsed JSON document on HTTP 200, otherwise None (non-200
        status, network failure, or a body that is not valid JSON).
    """
    product_url = f"https://api.digikala.com/v2/product/{product_id}/"
    try:
        response = requests.get(product_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch product {product_id}: {response.status_code}")
            return None
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout errors; ValueError
        # covers a response body that cannot be decoded as JSON.
        print(f"Error fetching product {product_id}: {e}")
        return None
def download_image(image_url, product_id, folder="digikala_images"):
    """Download one product image, OCR it and keep a copy if it matches.

    The image is written to ``folder``. If ``find_text_in_image`` reports a
    match, the file is additionally copied into ``<folder>/final``.

    Args:
        image_url: URL of the image to download.
        product_id: Product id; used in the file name and passed to the OCR
            step.
        folder: Directory that receives the downloaded images.

    Returns:
        None. Failures are printed and otherwise ignored so that a single
        bad image does not stop the crawl.
    """
    try:
        # stream=True avoids loading the whole image into memory; the context
        # manager releases the connection even if writing fails. The timeout
        # keeps a stalled server from blocking the worker forever.
        with requests.get(image_url, stream=True, timeout=30) as response:
            if response.status_code != 200:
                print(f"Failed to download: {image_url}")
                return
            # A random suffix keeps several images of one product apart.
            random_number = random.randint(0, 10000)
            image_name = f'{product_id}_{random_number}.jpeg'
            image_path = os.path.join(folder, image_name)
            with open(image_path, "wb") as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)

        found = find_text_in_image(image_path=image_path, product_id=product_id)
        if found:
            # Derive the destination from `folder` instead of a machine-specific
            # absolute path, and make sure it exists: shutil.copy would
            # otherwise create a plain file called "final".
            dest_dir = os.path.join(folder, "final")
            os.makedirs(dest_dir, exist_ok=True)
            copy_file(source_path=image_path, destination_dir=dest_dir)
    except Exception as e:
        # Worker-level boundary: report and move on to the next image.
        print(f"Error downloading {image_url}: {e}")
def process_product(product_id):
    """Fetch a product's details and download each of its images.

    Args:
        product_id: Id of the product to process.

    Returns:
        None. Products whose details cannot be fetched are skipped silently;
        products without any image are reported on stdout.
    """
    product_data = fetch_product_details(product_id)
    if not product_data:
        return

    # `or {}` / `or []` also cover keys that are present but null, which the
    # plain `.get(key, default)` chain would crash on.
    product = (product_data.get("data") or {}).get("product") or {}
    images = (product.get("images") or {}).get("list") or []
    if not images:
        print(f"product_id : ::::: {product_id}")
        return

    for image in images:
        # NOTE(review): assumes "url" holds a list of URLs and the first one
        # is the one to download — confirm against the API payload.
        urls = image.get("url") or []
        if urls and urls[0]:
            download_image(image_url=urls[0], product_id=product_id)
def main(max_page=200):
    """Walk the category pages and process every listed product in parallel.

    For each page the product ids are collected from the category search
    response and handed to a pool of worker processes running
    ``process_product``. The crawl stops at the first page that cannot be
    fetched.

    Args:
        max_page: Number of pages to request (page numbers 0..max_page-1).
    """
    # One pool for the whole run: spawning a fresh set of worker processes
    # for every page is wasted work, since the workers keep no state.
    with Pool(cpu_count()) as pool:
        # NOTE(review): numbering starts at 0 — confirm whether the API's
        # first page is 0 or 1.
        for page in range(max_page):
            print(page)
            category_url = f"https://api.digikala.com/v1/categories/rural-products/search/?page={page}&sort=7&th_no_track=1"
            print(category_url)

            category_data = fetch_products(category_url)
            if not category_data:
                return

            products = category_data.get("data", {}).get("products", [])
            product_ids = [product.get("id") for product in products if "id" in product]
            pool.map(process_product, product_ids)


if __name__ == "__main__":
    main()
from PIL import Image
import pytesseract
def find_text_in_image(image_path, product_id, target_text=''):
    """OCR an image (Persian) and tell whether ``target_text`` appears in it.

    The raw OCR output is always printed, labelled with ``product_id``.
    Returns True when the phrase is found, False when it is not or when the
    image cannot be processed. Note that the default empty ``target_text``
    is contained in every string and therefore always counts as found.
    """
    separator = '----------------------'
    try:
        picture = Image.open(image_path)
        # 'fas' selects the Persian language data of Tesseract.
        ocr_output = pytesseract.image_to_string(picture, lang="fas")
        print(separator)
        print(f"{product_id} : {ocr_output}")
        print(separator)
        matched = target_text in ocr_output
        print(f"Text found: {target_text}" if matched else "Text not found.")
        return matched
    except Exception as err:
        print(f"Error processing image: {err}")
        return False
# Test the function
image_path = "/home/user1/Documents/project/digi/digikala_images/sample.jpeg"  # Replace with your image file path
target_text = "شکار گنج"
# The second positional parameter is product_id, so the phrase must be passed
# by keyword; passed positionally it became the product id and the search
# phrase stayed empty, which matches every image.
find_text_in_image(image_path, product_id="sample", target_text=target_text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment