Created
June 3, 2024 16:14
-
-
Save fcavalcantirj/f17883c5852d4da2d6ea608db20cd300 to your computer and use it in GitHub Desktop.
chatwith.io crawler - using waatMessenger to report
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
import selenium.common.exceptions | |
import time | |
import datetime | |
import requests | |
import logging | |
import pymongo | |
from pymongo import MongoClient | |
# Set up logging: INFO level on the root logger, one module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up the webdriver: a single headless Chrome session, created once at
# import time and referenced as the global `driver` by the code below.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
driver = webdriver.Chrome(options=options)

# NOTE: the run timer is started further down, right before the crawl loop;
# an earlier duplicate `start_time` assignment here was dead code (it was
# overwritten before ever being read) and has been removed.

# Database connection. 'DB_URL' is a placeholder that must be replaced with
# the real MongoDB connection string before running.
client = MongoClient('DB_URL')  ########## update here
db = client['waatmessenger-db']
collection = db['all_numbers']
# Country slugs of every chatwith.io Instagram landing page, in crawl order.
# Keeping the full list as data (instead of commenting URLs in and out) makes
# resuming a crawl a one-line change: see RESUME_FROM_COUNTRY below.
COUNTRY_SLUGS = [
    "afghanistan", "albania", "algeria", "andorra", "angola",
    "antigua-and-barbuda", "argentina", "armenia", "australia", "austria",
    "azerbaijan",
    "bahamas", "bahrain", "bangladesh", "barbados", "belarus", "belgium",
    "belize", "benin", "bhutan", "bolivia", "bosnia-and-herzegovina",
    "botswana", "brazil", "brunei", "bulgaria", "burkina-faso", "burundi",
    "cambodia", "cameroon", "canada", "central-african-republic", "chad",
    "chile", "china", "colombia", "comoros", "costa-rica", "croatia", "cuba",
    "cyprus",
    "denmark", "dominican-republic",
    "ecuador", "egypt", "el-salvador", "equatorial-guinea", "eritrea",
    "estonia", "ethiopia",
    "fiji", "finland", "france",
    "gabon", "gambia", "georgia", "germany", "ghana", "greece", "guatemala",
    "guyana",
    "haiti", "honduras", "hungary",
    "iceland", "india", "indonesia", "iran", "iraq", "ireland", "israel",
    "italy",
    "jamaica", "japan", "jordan",
    "kazakhstan", "kenya", "kuwait",
    "laos", "latvia", "lebanon", "liberia", "libya", "liechtenstein",
    "lithuania", "luxembourg",
    "madagascar", "malaysia", "mali", "malta", "mauritania", "mauritius",
    "mexico", "moldova", "monaco", "mongolia", "montenegro", "morocco",
    "mozambique", "myanmar",
    "namibia", "nauru", "nepal", "netherlands", "new-zealand", "nicaragua",
    "niger", "nigeria", "north-korea", "north-macedonia", "norway",
    "oman",
    "pakistan", "palau", "palestine", "panama", "papua-new-guinea",
    "paraguay", "peru", "philippines", "poland", "portugal",
    "qatar",
    "romania", "russia", "rwanda",
    "samoa", "san-marino", "saudi-arabia", "senegal", "serbia", "seychelles",
    "sierra-leone", "singapore", "slovakia", "slovenia", "solomon-islands",
    "somalia", "south-africa", "south-korea", "spain", "sri-lanka", "sudan",
    "sweden", "switzerland", "syria",
    "taiwan", "tajikistan", "tanzania", "thailand", "togo", "tonga",
    "trinidad-and-tobago", "tunisia", "turkey", "turkmenistan", "tuvalu",
    "uganda", "ukraine", "united-arab-emirates", "united-kingdom",
    "united-states", "uruguay", "uzbekistan",
    "vanuatu", "vatican-city", "venezuela", "vietnam",
    "yemen",
    "zambia", "zimbabwe",
]

# First country to crawl; every slug listed before it in COUNTRY_SLUGS is
# skipped. Set to None to crawl all countries from the start of the list.
RESUME_FROM_COUNTRY = "togo"

LANDING_URL_TEMPLATE = "https://chatwith.io/instagram/landings/{slug}/all/all/"

# List of base URLs (one per country still to be crawled)
_first_country_index = (
    COUNTRY_SLUGS.index(RESUME_FROM_COUNTRY) if RESUME_FROM_COUNTRY else 0
)
base_urls = [
    LANDING_URL_TEMPLATE.format(slug=slug)
    for slug in COUNTRY_SLUGS[_first_country_index:]
]

# Start the timer
start_time = datetime.datetime.now()
def generate_summary_message(
    url: str, time_taken: "datetime.timedelta", total_numbers: int
) -> str:
    """Build the text of the per-URL summary report.

    Args:
        url: Base URL that was crawled; shown between asterisks in the header.
        time_taken: Duration of the crawl for that URL; rendered with ``str()``.
        total_numbers: Count of numbers reported for that URL.

    Returns:
        A three-part message: header line, blank line, then the total and the
        elapsed time on separate lines.
    """
    message_lines = [
        f"📪 *{url}* 📪",
        "",  # blank separator between the header and the figures
        f"Total numbers found: {total_numbers}",
        f"Time taken: {time_taken}",
    ]
    return "\n".join(message_lines)
def process_page(url, seen_records):
    """Scrape one listing page and store the identifiers not seen before.

    Loads ``url`` in the shared ``driver``, collects the last path segment of
    every ``<a>`` whose href contains ``/s/``, and inserts each previously
    unseen identifier into ``collection`` as ``<identifier>@s.whatsapp.net``.

    Args:
        url: Full page URL to load (base URL plus ``?page=N``).
        seen_records: Set of identifiers already collected for this base URL.
            It is updated in place with the new identifiers.

    Returns:
        A ``(count, seen_records)`` tuple. ``count`` is the number of new
        identifiers found on the page; it is 0 when the page yields nothing
        new or when loading/parsing the page failed. The caller treats 0 as
        "stop paginating".
    """
    try:
        driver.get(url)
        time.sleep(2)  # Wait for page to load

        # Collect identifiers as a set so that two links pointing at the same
        # identifier are only counted (and inserted) once.
        links = driver.find_elements(By.XPATH, '//a[contains(@href, "/s/")]')
        page_records = set()
        for link in links:
            href = link.get_attribute('href')
            if not href:
                continue
            # rstrip('/') so a trailing slash does not yield an empty
            # identifier (which would be stored as just "@s.whatsapp.net").
            record = href.rstrip('/').split('/')[-1]
            if record:
                page_records.add(record)

        # Detect if the records are repeated: nothing new means we are done.
        new_records = page_records - seen_records
        if not new_records:
            return 0, seen_records  # No new unique records found, stop processing

        # Update seen records
        seen_records.update(new_records)

        # Insert only the new numbers; records seen on earlier pages were
        # already inserted then and must not be written or counted again.
        for record in sorted(new_records):
            participant = record + "@s.whatsapp.net"
            data = {
                "participant": participant
            }
            try:
                collection.insert_one(data)
                logger.info(f"Inserted {participant}")
            except pymongo.errors.DuplicateKeyError:
                # NOTE(review): only raised if the collection enforces a
                # unique index -- confirm one exists on "participant".
                logger.warning(f"Duplicate entry: {participant}")
            except Exception as e:
                # Best effort: one failed insert must not abort the page.
                logger.error(f"Error inserting {participant}: {e}")

        return len(new_records), seen_records
    except selenium.common.exceptions.NoSuchElementException as e:
        logger.error(f"Error processing page {url}: Element not found - {e}")
    except Exception as e:
        logger.error(f"Unexpected error processing page {url}: {e}")
    # Reached only after a logged page-level error.
    return 0, seen_records
# Main script to loop through URLs and their pages and process them.
# The whole crawl runs inside try/finally so the Chrome session is always
# shut down, even when the run is interrupted or an unexpected error escapes.
total_numbers_found = 0

# First page to request for specific countries instead of page 0.
# Presumably where an earlier run stopped -- verify before reusing.
# Matching is by substring of the base URL, as in the original checks.
resume_pages = {
    "brazil": 634,
    "malaysia": 1470,
}

try:
    for base_url in base_urls:
        page_number = 0
        seen_records = set()
        for country, first_page in resume_pages.items():
            if country in base_url:
                page_number = first_page
        url_start_time = datetime.datetime.now()

        while True:
            url = base_url + "?page=" + str(page_number)
            numbers_found, seen_records = process_page(url, seen_records)
            if numbers_found == 0:
                logger.info(f"No more pages to process for {base_url}. Stopping at page {page_number}.")
                break
            total_numbers_found += numbers_found
            logger.info(f"Processed page {page_number} of {base_url}, found {numbers_found} numbers.")
            page_number += 1

        # Calculate and print the time taken for this URL
        url_end_time = datetime.datetime.now()
        url_time_taken = url_end_time - url_start_time
        logger.info(f"Time taken for {base_url}: {url_time_taken}")
        print(f"Time taken for {base_url}: {url_time_taken}")

        # Send summary message for this URL
        url_summary = generate_summary_message(base_url, url_time_taken, len(seen_records))
        payload = {
            "numbers": "YOUR_NUMBER",  ########## update here
            "message": url_summary
        }
        logger.info(f"Summary message body for {base_url}: {url_summary}")

        ########## update token below - use https://browbot-waatmessenger-html.pages.dev/ and scan qrCode with whatsApp
        try:
            # timeout: without it a stalled API call would block the crawl
            # forever. A failed report is logged and the crawl continues.
            response = requests.post(
                "https://api.waatmessenger.com.br/message/YOUR_TOKEN",
                json=payload,
                headers={'Content-Type': 'application/json'},
                timeout=30,
            )
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.error(f"Failed to send summary message for {base_url}: {e}")

    # Calculate and print the total time taken
    end_time = datetime.datetime.now()
    time_taken = end_time - start_time
    logger.info(f"Total time taken: {time_taken}")
    print(f"Total time taken: {time_taken}")
finally:
    # Close the driver
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment