Created
October 8, 2025 11:40
-
-
Save ayoubzulfiqar/909959837b7578eb4770e9ae7b22380b to your computer and use it in GitHub Desktop.
LeetCode Data Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| import json | |
| import time | |
| import os | |
| from typing import Dict, List, Any | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| import threading | |
class LeetCodeDataFetcher:
    """Download the LeetCode problem list and per-problem details via GraphQL.

    All requests go through one shared ``requests.Session`` and are POSTed to
    ``base_url``. Detail fetches are fanned out over a thread pool.

    Args:
        max_workers: Size of the thread pool used for detail fetches.
        timeout: Seconds to wait for each HTTP request before giving up.
        request_delay: Seconds each worker pauses after a network fetch, so
            the pool as a whole stays throttled. ``0`` disables the pause.
    """

    def __init__(self, max_workers=3, timeout=30, request_delay=0.1):
        self.base_url = "https://leetcode.com/graphql"
        self.headers = {
            "Content-Type": "application/json",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        self.max_workers = max_workers
        self.timeout = timeout
        self.request_delay = request_delay
        # Kept for backward compatibility; results are only ever appended by
        # the collecting thread, so nothing in this class needs the lock.
        self.lock = threading.Lock()

    def fetch_problem_list(self) -> Dict[str, Any]:
        """Fetch the complete list of problems.

        Returns:
            The decoded GraphQL response. On any failure (HTTP error, timeout,
            non-JSON body, or a reply without ``data.problemsetQuestionList``)
            an error is printed and an empty listing of the same shape is
            returned, so callers can always index
            ``["data"]["problemsetQuestionList"]``.
        """
        query = """
        query problemsetQuestionList {
            problemsetQuestionList: questionList(
                categorySlug: ""
                limit: -1
                skip: 0
                filters: {}
            ) {
                total: totalNum
                questions: data {
                    acRate
                    difficulty
                    freqBar
                    frontendQuestionId: questionFrontendId
                    isFavor
                    paidOnly: isPaidOnly
                    status
                    title
                    titleSlug
                    topicTags {
                        name
                        slug
                    }
                }
            }
        }
        """
        empty = {"data": {"problemsetQuestionList": {"total": 0, "questions": []}}}
        try:
            response = self.session.post(
                self.base_url, json={"query": query}, timeout=self.timeout
            )
            response.raise_for_status()
            payload = response.json()
        # ValueError covers JSON decode failures on requests versions whose
        # decode error is not a RequestException subclass.
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Error fetching problem list: {e}")
            return empty

        data = payload.get("data") if isinstance(payload, dict) else None
        listing = (data or {}).get("problemsetQuestionList")
        if not isinstance(listing, dict) or listing.get("questions") is None:
            # GraphQL reports failures with HTTP 200 and a null "data" field.
            errors = payload.get("errors") if isinstance(payload, dict) else payload
            print(f"Error fetching problem list: {errors}")
            return empty
        return payload

    def fetch_single_problem(self, question: Dict[str, Any]) -> Dict[str, Any]:
        """Fetch detailed content for a single problem.

        Args:
            question: One entry of the problem list; ``titleSlug`` selects the
                problem and ``paidOnly`` marks premium problems.

        Returns:
            A dict with ``basic_info`` (the input), ``detailed_content`` (the
            ``question`` object of the reply, or ``None``) and ``is_paid``.
            An ``error`` key is present when no content was obtained: paid
            problems are skipped without a request, and any request, decode or
            GraphQL failure is reported here instead of being raised.
        """
        is_paid = bool(question.get("paidOnly"))
        if is_paid:
            return {
                "basic_info": question,
                "detailed_content": None,
                "is_paid": True,
                "error": "Paid problem - no detailed content available"
            }
        query = """
        query questionContent($titleSlug: String!) {
            question(titleSlug: $titleSlug) {
                content
                codeDefinition
                codeSnippets {
                    lang
                    langSlug
                    code
                }
                difficulty
                frontendQuestionId: questionFrontendId
                paidOnly: isPaidOnly
                title
                titleSlug
                topicTags {
                    name
                    slug
                }
                hints
                exampleTestcases
                metaData
                stats
                similarQuestions
            }
        }
        """
        try:
            variables = {"titleSlug": question["titleSlug"]}
            response = self.session.post(
                self.base_url,
                json={"query": query, "variables": variables},
                timeout=self.timeout,
            )
            response.raise_for_status()
            payload = response.json()
            details = (payload.get("data") or {}).get("question")
            if details is None:
                # Without this, an empty reply would be counted as a success.
                raise ValueError(
                    f"No question data returned: {payload.get('errors')}"
                )
        except Exception as e:
            # Worker boundary: one bad problem must not abort the whole batch.
            return {
                "basic_info": question,
                "detailed_content": None,
                "is_paid": is_paid,
                "error": str(e)
            }
        return {
            "basic_info": question,
            "detailed_content": details,
            "is_paid": False
        }

    def _fetch_throttled(self, question: Dict[str, Any]) -> Dict[str, Any]:
        """Run ``fetch_single_problem`` and pause the worker after a network fetch."""
        result = self.fetch_single_problem(question)
        # Paid problems return without a request, so they need no pause.
        if self.request_delay and not result.get("is_paid"):
            time.sleep(self.request_delay)
        return result

    def fetch_all_problems_concurrent(self) -> Dict[str, Any]:
        """Fetch all problems concurrently.

        Returns:
            A dict with ``metadata`` (``total_problems``, ``fetch_time``,
            ``method``) and ``problems``, the per-problem results in
            completion order.
        """
        print("Fetching problem list...")
        problem_list = self.fetch_problem_list()
        listing = (problem_list.get("data") or {}).get("problemsetQuestionList") or {}
        questions = listing.get("questions") or []
        total = len(questions)
        all_problems = {
            "metadata": {
                "total_problems": listing.get("total", total),
                "fetch_time": time.strftime("%Y-%m-%d %H:%M:%S"),
                "method": "concurrent_fetch"
            },
            "problems": []
        }
        print(f"Fetching detailed content for {total} problems concurrently (max {self.max_workers} workers)...")
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # The delay lives in the workers: sleeping in this collector loop
            # would not slow down requests that are already queued.
            futures = [
                executor.submit(self._fetch_throttled, question)
                for question in questions
            ]
            # Collect results as they complete
            for completed, future in enumerate(as_completed(futures), start=1):
                result = future.result()
                all_problems["problems"].append(result)
                # Print progress
                question_title = result["basic_info"].get("title")
                if result.get("is_paid"):
                    status = "PAID"
                elif "error" in result:
                    status = "ERROR"
                else:
                    status = "SUCCESS"
                print(f"Progress: {completed}/{total} - {question_title} [{status}]")
        return all_problems
def _write_json(path, payload):
    """Write *payload* to *path* as indented UTF-8 JSON, keeping non-ASCII text."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)


def main():
    """Fetch every LeetCode problem and write the data and summary JSON files.

    Writes three files to the current directory:
    ``leetcode_complete_data.json`` (everything fetched),
    ``leetcode_summary.json`` (counts by status, difficulty and tag) and
    ``leetcode_free_problems.json`` (free problems fetched without error).
    """
    print("Starting LeetCode data fetch...")
    # Initialize fetcher
    fetcher = LeetCodeDataFetcher(max_workers=3)  # Conservative rate limiting
    # Fetch all data
    all_data = fetcher.fetch_all_problems_concurrent()

    # Save complete data
    output_file = "leetcode_complete_data.json"
    _write_json(output_file, all_data)
    print(f"\nComplete data saved to {output_file}")

    # Partition results: paid problems carry an "error" too, so they are
    # excluded from the failed bucket explicitly.
    problems = all_data["problems"]
    free_problems = [p for p in problems if not p.get("is_paid") and "error" not in p]
    paid_problems = [p for p in problems if p.get("is_paid")]
    failed_fetches = [p for p in problems if "error" in p and not p.get("is_paid")]

    # Count by difficulty and tags for free problems only
    difficulties = {}
    tags = {}
    stats = {
        "easy_count": 0,
        "medium_count": 0,
        "hard_count": 0
    }
    for problem in free_problems:
        basic_info = problem.get("basic_info")
        if not basic_info:
            continue
        difficulty = basic_info["difficulty"]
        difficulties[difficulty] = difficulties.get(difficulty, 0) + 1
        # Only the three known difficulty levels have a dedicated counter.
        stat_key = f"{difficulty.lower()}_count"
        if stat_key in stats:
            stats[stat_key] += 1
        for tag in basic_info.get("topicTags") or []:
            tag_name = tag["name"]
            tags[tag_name] = tags.get(tag_name, 0) + 1

    summary = {
        "metadata": all_data["metadata"],
        "summary": {
            "total_problems": len(problems),
            "free_problems_with_content": len(free_problems),
            "paid_problems": len(paid_problems),
            "failed_free_problems": len(failed_fetches),
            "difficulties": difficulties,
            "tags": tags,
            "stats": stats
        }
    }
    summary_file = "leetcode_summary.json"
    _write_json(summary_file, summary)
    print(f"Summary saved to {summary_file}")

    # Print final stats
    counts = summary["summary"]
    print("\nFinal Statistics:")
    print(f"- Total problems: {counts['total_problems']}")
    print(f"- Free problems with content: {counts['free_problems_with_content']}")
    print(f"- Paid problems: {counts['paid_problems']}")
    print(f"- Failed free problem fetches: {counts['failed_free_problems']}")
    print("- Difficulty breakdown:")
    # The per-difficulty counters live under "stats"; reading them from the
    # parent dict raised KeyError before the last file was written.
    print(f" - Easy: {counts['stats']['easy_count']}")
    print(f" - Medium: {counts['stats']['medium_count']}")
    print(f" - Hard: {counts['stats']['hard_count']}")

    # Save separate files for easier GitHub Pages access
    # Free problems only
    free_problems_data = {
        "metadata": all_data["metadata"],
        "problems": free_problems
    }
    _write_json("leetcode_free_problems.json", free_problems_data)
    print("Free problems data saved to leetcode_free_problems.json")
| if __name__ == "__main__": | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment