Skip to content

Instantly share code, notes, and snippets.

@fcavalcantirj
Created June 3, 2024 16:14
Show Gist options
  • Save fcavalcantirj/f17883c5852d4da2d6ea608db20cd300 to your computer and use it in GitHub Desktop.
Save fcavalcantirj/f17883c5852d4da2d6ea608db20cd300 to your computer and use it in GitHub Desktop.
chatwith.io crawler - using waatMessenger to report
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium.common.exceptions
import time
import datetime
import requests
import logging
import pymongo
from pymongo import MongoClient
# Configure root logging at INFO level and create this module's logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Build the Chrome webdriver used for every page fetch (headless, GPU
# disabled, sandbox disabled)
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--disable-gpu', '--no-sandbox'):
    options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=options)
start_time = datetime.datetime.now()

# MongoDB handles: client -> database -> collection that receives the records
client = MongoClient('DB_URL') ########## update here
db = client['waatmessenger-db']
collection = db['all_numbers']
# Landing-page URL pattern; the placeholder receives the country slug
_LANDING_URL_TEMPLATE = "https://chatwith.io/instagram/landings/{}/all/all/"

# Country slugs that are switched off and therefore NOT part of base_urls.
# NOTE(review): presumably these were already crawled in an earlier run --
# confirm before re-enabling any of them.
_DISABLED_COUNTRY_SLUGS = (
    "afghanistan", "albania", "algeria", "andorra", "angola",
    "antigua-and-barbuda", "argentina", "armenia", "australia", "austria",
    "azerbaijan", "bahamas", "bahrain", "bangladesh", "barbados",
    "belarus", "belgium", "belize", "benin", "bhutan",
    "bolivia", "bosnia-and-herzegovina", "botswana", "brazil", "brunei",
    "bulgaria", "burkina-faso", "burundi", "cambodia", "cameroon",
    "canada", "central-african-republic", "chad", "chile", "china",
    "colombia", "comoros", "costa-rica", "croatia", "cuba",
    "cyprus", "denmark", "dominican-republic", "ecuador", "egypt",
    "el-salvador", "equatorial-guinea", "eritrea", "estonia", "ethiopia",
    "fiji", "finland", "france", "gabon", "gambia",
    "georgia", "germany", "ghana", "greece", "guatemala",
    "guyana", "haiti", "honduras", "hungary", "iceland",
    "india", "indonesia", "iran", "iraq", "ireland",
    "israel", "italy", "jamaica", "japan", "jordan",
    "kazakhstan", "kenya", "kuwait", "laos", "latvia",
    "lebanon", "liberia", "libya", "liechtenstein", "lithuania",
    "luxembourg", "madagascar", "malaysia", "mali", "malta",
    "mauritania", "mauritius", "mexico", "moldova", "monaco",
    "mongolia", "montenegro", "morocco", "mozambique", "myanmar",
    "namibia", "nauru", "nepal", "netherlands", "new-zealand",
    "nicaragua", "niger", "nigeria", "north-korea", "north-macedonia",
    "norway", "oman", "pakistan", "palau", "palestine",
    "panama", "papua-new-guinea", "paraguay", "peru", "philippines",
    "poland", "portugal", "qatar", "romania", "russia",
    "rwanda", "samoa", "san-marino", "saudi-arabia", "senegal",
    "serbia", "seychelles", "sierra-leone", "singapore", "slovakia",
    "slovenia", "solomon-islands", "somalia", "south-africa", "south-korea",
    "spain", "sri-lanka", "sudan", "sweden", "switzerland",
    "syria", "taiwan", "tajikistan", "tanzania", "thailand",
)

# Country slugs that are crawled, in processing order
_ENABLED_COUNTRY_SLUGS = (
    "togo",
    "tonga",
    "trinidad-and-tobago",
    "tunisia",
    "turkey",
    "turkmenistan",
    "tuvalu",
    "uganda",
    "ukraine",
    "united-arab-emirates",
    "united-kingdom",
    "united-states",
    "uruguay",
    "uzbekistan",
    "vanuatu",
    "vatican-city",
    "venezuela",
    "vietnam",
    "yemen",
    "zambia",
    "zimbabwe",
)

# List of base URLs, one per enabled country
base_urls = [_LANDING_URL_TEMPLATE.format(slug) for slug in _ENABLED_COUNTRY_SLUGS]
# Start the timer: this timestamp is subtracted from the end time at the
# bottom of the script to report the total run time. It overwrites the
# start_time value assigned earlier, right after the webdriver is created.
start_time = datetime.datetime.now()
# Function to generate summary message
def generate_summary_message(url, time_taken, total_numbers):
    """Return the text of the per-URL summary report.

    Args:
        url: Base URL the summary is about; shown in bold (``*...*``) on the
            first line.
        time_taken: Elapsed time for that URL; interpolated with its default
            string formatting.
        total_numbers: Count of numbers found for that URL.

    Returns:
        A three-part message: a header line with the URL, a blank line, then
        the total and the elapsed time on separate lines.
    """
    return (
        f"📪 *{url}* 📪\n\n"
        f"Total numbers found: {total_numbers}\n"
        f"Time taken: {time_taken}"
    )
def process_page(url, seen_records):
    """Load one listing page and store the identifiers not seen before.

    The page is opened with the module-level ``driver``; every anchor whose
    href contains ``/s/`` is collected and the last path segment of each href
    is taken as the identifier. Identifiers that are not yet in
    ``seen_records`` are inserted into the module-level ``collection`` as
    ``{"participant": "<identifier>@s.whatsapp.net"}``.

    Args:
        url: Full page URL to load.
        seen_records: Set of identifiers already handled for the current
            base URL. It is updated in place and also returned.

    Returns:
        A ``(new_count, seen_records)`` tuple. ``new_count`` is the number of
        identifiers on this page that were not already in ``seen_records``.
        It is 0 when the page yields nothing new or when an error occurred
        (errors are logged, not raised); the caller treats 0 as "stop
        paginating".
    """
    try:
        driver.get(url)
        time.sleep(2)  # Wait for page to load

        # Collect the distinct hrefs of all matching links. get_attribute()
        # returns None when the attribute is absent, so drop that entry
        # instead of letting .split() fail and discard the whole page.
        links = driver.find_elements(By.XPATH, '//a[contains(@href, "/s/")]')
        hrefs = {link.get_attribute('href') for link in links}
        hrefs.discard(None)

        # The identifier is the last path segment of the href. An href ending
        # in "/" would produce an empty identifier, which is not a usable
        # record, so it is discarded.
        cleaned_hrefs = {href.split('/')[-1] for href in hrefs}
        cleaned_hrefs.discard('')

        # Only identifiers not seen on earlier pages count as new; when there
        # are none, the listing has started repeating (or is empty).
        new_records = cleaned_hrefs - seen_records
        if not new_records:
            return 0, seen_records  # No new unique records found, stop processing

        seen_records.update(new_records)

        # Insert only the new identifiers so that a page partially overlapping
        # earlier ones does not write the same record again.
        for record in sorted(new_records):
            participant = record + "@s.whatsapp.net"
            data = {
                "participant": participant
            }
            try:
                collection.insert_one(data)
                logger.info(f"Inserted {participant}")
            except pymongo.errors.DuplicateKeyError:
                logger.warning(f"Duplicate entry: {participant}")
            except Exception as e:
                # Best effort: one failed insert must not abort the page
                logger.error(f"Error inserting {participant}: {e}")

        return len(new_records), seen_records
    except selenium.common.exceptions.NoSuchElementException as e:
        logger.error(f"Error processing page {url}: Element not found - {e}")
    except Exception as e:
        logger.error(f"Unexpected error processing page {url}: {e}")
    # Reached only after a logged error
    return 0, seen_records
# Main script to loop through URLs and their pages and process them.
# Everything runs inside try/finally so the browser is shut down even when the
# crawl is interrupted or fails part-way.
total_numbers_found = 0
try:
    for base_url in base_urls:
        page_number = 0
        seen_records = set()
        # Hard-coded starting pages for two countries.
        # NOTE(review): presumably resume points from an earlier interrupted
        # run -- confirm before relying on them.
        if "brazil" in base_url:
            page_number = 634
        if "malaysia" in base_url:
            page_number = 1470
        url_start_time = datetime.datetime.now()

        # Walk the pages of this base URL until a page yields nothing new
        # (process_page also returns 0 after a logged error).
        while True:
            url = base_url + "?page=" + str(page_number)
            numbers_found, seen_records = process_page(url, seen_records)
            if numbers_found == 0:
                logger.info(f"No more pages to process for {base_url}. Stopping at page {page_number}.")
                break
            total_numbers_found += numbers_found
            logger.info(f"Processed page {page_number} of {base_url}, found {numbers_found} numbers.")
            page_number += 1

        # Calculate and print the time taken for this URL
        url_end_time = datetime.datetime.now()
        url_time_taken = url_end_time - url_start_time
        logger.info(f"Time taken for {base_url}: {url_time_taken}")
        print(f"Time taken for {base_url}: {url_time_taken}")

        # Send summary message for this URL
        url_summary = generate_summary_message(base_url, url_time_taken, len(seen_records))
        payload = {
            "numbers": "YOUR_NUMBER", ########## update here
            "message": url_summary
        }
        logger.info(f"Summary message body for {base_url}: {url_summary}")
        ########## update token below - use https://browbot-waatmessenger-html.pages.dev/ and scan qrCode with whatsApp
        try:
            # requests has no default timeout; without one a stalled server
            # would block the crawl forever.
            response = requests.post(
                "https://api.waatmessenger.com.br/message/YOUR_TOKEN",
                json=payload,
                headers={'Content-Type': 'application/json'},
                timeout=30,
            )
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Reporting is best effort: a failed report must not stop the crawl
            logger.error(f"Failed to send summary message for {base_url}: {e}")

    # Calculate and print the total time taken
    end_time = datetime.datetime.now()
    time_taken = end_time - start_time
    logger.info(f"Total numbers found across all URLs: {total_numbers_found}")
    logger.info(f"Total time taken: {time_taken}")
    print(f"Total time taken: {time_taken}")
finally:
    # Close the driver
    driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment