tiktok comment scraper
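A minimal Selenium script that drives Firefox, loads a TikTok session from a Cookie Quick Manager export, and scrolls a post's comment section to collect the top-level (level-1) comments.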
import json
import random
import time
from pprint import pprint

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service


class MinimalFirefoxScraper:
    def __init__(self, cookies_file=None):
        """Initialize Firefox WebDriver"""
        options = webdriver.FirefoxOptions()

        # Basic stealth settings
        options.set_preference("dom.webdriver.enabled", False)
        options.set_preference("useAutomationExtension", False)

        # Initialize driver
        service = Service()
        self.driver = webdriver.Firefox(service=service, options=options)
        self.driver.set_page_load_timeout(30)

        if cookies_file:
            self.load_cookies(cookies_file)

    def format_cookie(self, cookie):
        """Convert Cookie Quick Manager format to Selenium format"""
        # Extract domain from "Host raw"
        domain = (
            cookie.get("Host raw", "")
            .replace("https://", "")
            .replace("http://", "")
            .strip("./")
        )
        formatted = {
            "name": cookie.get("Name raw", ""),
            "value": cookie.get("Content raw", ""),
            "domain": domain,
            "path": cookie.get("Path raw", "/"),
            "secure": cookie.get("Send for raw", "true").lower() == "true",
            "httpOnly": cookie.get("HTTP only raw", "false").lower() == "true",
        }

        # Handle expiry
        if "Expires raw" in cookie:
            try:
                formatted["expiry"] = int(cookie["Expires raw"])
            except ValueError:
                pass

        # Handle SameSite
        same_site = cookie.get("SameSite raw", "")
        if same_site == "no_restriction":
            formatted["sameSite"] = "None"
        elif same_site.lower() == "strict":
            formatted["sameSite"] = "Strict"
        else:
            formatted["sameSite"] = "Lax"

        return formatted
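
    # Illustrative example (values invented; field names taken from the parser
    # above): a Cookie Quick Manager entry such as
    #   {"Name raw": "sessionid", "Content raw": "0123abcd",
    #    "Host raw": "https://.tiktok.com", "Path raw": "/",
    #    "Send for raw": "true", "HTTP only raw": "true",
    #    "Expires raw": "1767225600", "SameSite raw": "no_restriction"}
    # becomes the Selenium cookie dict
    #   {"name": "sessionid", "value": "0123abcd", "domain": "tiktok.com",
    #    "path": "/", "secure": True, "httpOnly": True,
    #    "expiry": 1767225600, "sameSite": "None"}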

    def load_cookies(self, file_path):
        """Load cookies from Cookie Quick Manager export"""
        # First visit the site
        self.driver.get("https://www.tiktok.com")
        time.sleep(2)

        try:
            with open(file_path) as f:
                cookies = json.load(f)

            # Add each cookie
            for cookie in cookies:
                try:
                    formatted_cookie = self.format_cookie(cookie)
                    if formatted_cookie["name"] and formatted_cookie["value"]:
                        self.driver.add_cookie(formatted_cookie)
                        print(f"Added cookie: {formatted_cookie['name']}")
                except Exception as e:
                    print(
                        f"Error adding cookie {cookie.get('Name raw', 'unknown')}: {str(e)}"
                    )

            print("Finished loading cookies")

            # Refresh to apply cookies
            self.driver.refresh()
            time.sleep(2)
        except Exception as e:
            print(f"Error loading cookies: {str(e)}")
            raise

    def random_sleep(self, min_sec=1, max_sec=3):
        """Sleep for a random duration"""
        time.sleep(random.uniform(min_sec, max_sec))

    def scroll_to_element(self, element):
        """Scroll element into view with a smooth motion"""
        self.driver.execute_script(
            "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});",
            element,
        )
        self.random_sleep(0.5, 1)

    def extract_comments(self, url, max_comments=None, scroll_attempts=10):
        """Extract all level-1 comments from a page"""
        print(f"Loading page: {url}")
        self.driver.get(url)
        self.random_sleep(3, 5)  # Wait for initial load

        comments = []
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        scroll_count = 0
        seen_comments = set()  # Track unique comments

        print("Starting comment extraction...")
        while scroll_count < scroll_attempts:
            try:
                # Find all level-1 comments
                comment_elements = self.driver.find_elements(
                    By.CSS_SELECTOR, '[data-e2e="comment-level-1"]'
                )

                # Process new comments
                for element in comment_elements:
                    try:
                        # Get comment text
                        comment_text = element.text.strip()

                        # Skip if we've seen this comment or it's empty
                        if not comment_text or comment_text in seen_comments:
                            continue

                        # Add to our collections
                        comments.append(comment_text)
                        seen_comments.add(comment_text)

                        # Scroll to this comment (makes loading look more natural)
                        self.scroll_to_element(element)

                        print(f"Found comment {len(comments)}: {comment_text[:50]}...")

                        # Check if we've hit our limit
                        if max_comments and len(comments) >= max_comments:
                            print(f"Reached maximum comments limit: {max_comments}")
                            return comments
                    except Exception as e:
                        print(f"Error processing comment: {str(e)}")
                        continue

                # Scroll down
                self.driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);"
                )
                self.random_sleep(1, 2)

                # Check if we've reached the bottom
                new_height = self.driver.execute_script(
                    "return document.body.scrollHeight"
                )
                if new_height == last_height:
                    scroll_count += 1
                    print(
                        f"No new content loaded. Attempt {scroll_count}/{scroll_attempts}"
                    )
                else:
                    scroll_count = 0  # Reset counter if we found new content
                    print("Found new content. Continuing to scroll...")
                last_height = new_height
            except Exception as e:
                print(f"Error during scrolling: {str(e)}")
                scroll_count += 1

        print(f"Finished extracting comments. Found {len(comments)} unique comments.")
        return comments

    def close(self):
        """Close the browser"""
        self.driver.quit()

TEST_POST_URL = "https://www.tiktok.com/@tjrtrades/video/7379601136627518751"
COOKIES_FILE = "data/tiktok_cookies_firefox.json"


def main():
    scraper = None
    try:
        # Initialize scraper with cookies
        scraper = MinimalFirefoxScraper(cookies_file=COOKIES_FILE)

        # Try loading a page
        url = TEST_POST_URL  # Replace with the post you want to scrape
        comments = scraper.extract_comments(url)
        pprint(comments)
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        if scraper:
            scraper.close()


if __name__ == "__main__":
    main()
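For reference, a quick sketch of calling the scraper directly with a comment cap and saving the results instead of pretty-printing them; the output filename and the cap of 50 are illustrative choices, not part of the gist:

# Illustrative addendum (assumes the definitions above):
scraper = MinimalFirefoxScraper(cookies_file=COOKIES_FILE)
try:
    comments = scraper.extract_comments(TEST_POST_URL, max_comments=50)
    with open("comments.json", "w") as f:
        json.dump(comments, f, ensure_ascii=False, indent=2)
finally:
    scraper.close()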