Created
May 30, 2025 18:21
-
-
Save makslevental/62f49bf8e482370792a2475939006f90 to your computer and use it in GitHub Desktop.
Cluster LeetCode problems by solution strategy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
import time
from collections import defaultdict
from pprint import pprint

import hdbscan
import leetcode
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
# Session credentials for leetcode.com. Both values are cookies of a logged-in
# browser session; they are read from the environment so that no secret has to
# live in the source. A missing variable fails fast with a KeyError naming it.
csrf_token = os.environ["LEETCODE_CSRF_TOKEN"]
leetcode_session = os.environ["LEETCODE_SESSION"]

configuration = leetcode.Configuration()
configuration.api_key["x-csrftoken"] = csrf_token
configuration.api_key["csrftoken"] = csrf_token
configuration.api_key["LEETCODE_SESSION"] = leetcode_session
configuration.api_key["Referer"] = "https://leetcode.com"
configuration.debug = False

# Shared API client used by the GraphQL helpers in this file.
api_instance = leetcode.DefaultApi(leetcode.ApiClient(configuration))
def get_num_favs(favorite_slug):
    """Return the ``questionNumber`` reported for a favourites list.

    Sends the ``favoriteDetailV2`` GraphQL query for ``favorite_slug`` through
    the module-level ``api_instance`` and returns the ``questionNumber`` field
    of the reply (used by ``get_questions`` as the page size).
    """
    operation = "favoriteDetailV2"
    query = """
    query favoriteDetailV2($favoriteSlug: String!) {
      favoriteDetailV2(favoriteSlug: $favoriteSlug) {
        coverUrl
        coverEmoji
        coverBackgroundColor
        description
        creator {
          realName
          userAvatar
          userSlug
        }
        hasCurrentQuestion
        isPublicFavorite
        lastQuestionAddedAt
        name
        questionNumber
        slug
        isDefaultList
        favoriteType
        lastModified: lastQuestionAddedAt
        languageTagSlug
        filtersInfo
        sortByInfo
        visibleFilters
        collectCount
      }
    }
    """
    payload = {
        "query": query,
        "variables": {"favoriteSlug": favorite_slug},
        "operationName": operation,
    }
    # _preload_content=False hands back the raw HTTP response, so the JSON
    # body is decoded here rather than by the generated client models.
    response = api_instance.graphql_post(body=payload, _preload_content=False)
    detail = response.json()["data"][operation]
    return detail["questionNumber"]
def get_questions(list_id):
    """Fetch the questions of the favourites list ``list_id`` in one request.

    The list size is looked up first via ``get_num_favs`` and used as the
    ``limit`` of a single ``problemsetQuestionList`` query, so no paging is
    performed. Returns the ``questions`` attribute of the decoded response.
    """
    total = get_num_favs(list_id)
    models = leetcode.models

    list_filter = models.graphql_query_problemset_question_list_variables_filter_input.GraphqlQueryProblemsetQuestionListVariablesFilterInput(
        list_id=list_id
    )
    query_variables = models.graphql_query_problemset_question_list_variables.GraphqlQueryProblemsetQuestionListVariables(
        category_slug="",
        limit=total,
        skip=0,
        filters=list_filter,
    )
    request = models.graphql_query.GraphqlQuery(
        query="""
        query problemsetQuestionList($categorySlug: String, $limit: Int, $skip: Int, $filters: QuestionListFilterInput) {
          problemsetQuestionList: questionList(
            categorySlug: $categorySlug
            limit: $limit
            skip: $skip
            filters: $filters
          ) {
            questions: data {
              questionFrontendId
              title
              titleSlug
              categoryTitle
              freqBar
              content
              isPaidOnly
              difficulty
              likes
              dislikes
              topicTags {
                name
                slug
              }
              stats
              hints
            }
          }
        }
        """,
        variables=query_variables,
        operation_name="problemsetQuestionList",
    )

    time.sleep(2)  # Leetcode has a rate limiter
    response = api_instance.graphql_post(body=request)
    return response.data.problemset_question_list.questions
# Captures the remainder of a "### Approach ..." markdown heading, stopping at
# the end of the line or at the first "<" (start of inline markup).
_APPROACH_HEADING = re.compile(r"### Approach ([^<\n]*)")


def _approach_names(markdown):
    """Return the approach name of every ``### Approach`` heading in ``markdown``."""
    return [
        # Keep only the text after the last ":" (drops prefixes such as
        # "1:"), then trim whitespace and remove double quotes.
        heading.split(":")[-1].strip().replace('"', "")
        for heading in _APPROACH_HEADING.findall(markdown)
    ]


def get_solution_tags(problem_slug: str):
    """Return the approach names listed in a problem's official solution.

    Queries ``ugcArticleOfficialSolutionArticle`` for ``problem_slug`` and
    extracts one name per ``### Approach ...`` heading of the article
    markdown.

    Args:
        problem_slug: The problem's title slug, sent as ``questionSlug``.

    Returns:
        A list of approach names (possibly empty), or ``None`` when the
        request yields no response or the article has no content. In the
        latter case a "missing soln" message is printed.
    """
    query = """
    query ugcArticleOfficialSolutionArticle($questionSlug: String!) {
      ugcArticleOfficialSolutionArticle(questionSlug: $questionSlug) {
        content
      }
    }
    """
    graphql_request = {
        "query": query,
        "variables": {"questionSlug": problem_slug},
        "operationName": "ugcArticleOfficialSolutionArticle",
    }
    response = api_instance.graphql_post(body=graphql_request, _preload_content=False)
    if not response:
        return None

    # A GraphQL reply may carry "data": null (e.g. alongside an "errors"
    # entry) and the article itself or its content may be null; treat all of
    # these as "no solution" instead of failing on a None subscript.
    payload = response.json() or {}
    article = (payload.get("data") or {}).get("ugcArticleOfficialSolutionArticle")
    if not article or not article.get("content"):
        print(f"{problem_slug=} missing soln")
        return None

    return _approach_names(article["content"])
def get_all_soln_strats(list_id):
    """Collect topic-tag names and solution approach names for a list.

    For every question returned by ``get_questions(list_id)`` the names of its
    topic tags are added, followed by whatever ``get_solution_tags`` returns
    for the question's title slug (skipped when that is ``None`` or empty).
    Returns the flat list of strings, in question order.
    """
    strategies = []
    for question in get_questions(list_id):
        strategies += [tag.name for tag in question.topic_tags]
        approaches = get_solution_tags(question.title_slug)
        if approaches:
            strategies.extend(approaches)
        # Short pause between per-question requests.
        time.sleep(0.2)
    return strategies
def cluster_by_soln_strat(list_id):
    """Group the strategy strings of a favourites list into HDBSCAN clusters.

    The strings from ``get_all_soln_strats(list_id)`` are turned into TF-IDF
    vectors, L2-normalised and clustered with HDBSCAN (``min_cluster_size=2``,
    euclidean metric).

    Args:
        list_id: Identifier of the favourites list to analyse.

    Returns:
        A ``defaultdict(list)`` mapping each cluster label (a plain ``int``)
        to the strings assigned to it. The mapping is empty when no strings
        were collected.
    """
    all_text = get_all_soln_strats(list_id)
    clusters = defaultdict(list)
    if not all_text:
        # TfidfVectorizer raises ValueError on an empty corpus; there is
        # nothing to cluster anyway.
        return clusters

    tfidf = TfidfVectorizer().fit_transform(all_text)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=2, metric="euclidean")
    labels = clusterer.fit_predict(normalize(tfidf))

    for sentence, label in zip(all_text, labels):
        # Convert the NumPy integer label to a built-in int so the keys print
        # cleanly; both hash equal, so lookups by either type still work.
        clusters[int(label)].append(sentence)
    return clusters
# Identifier of the LeetCode favourites list to analyse.
fav_list_id = "2jvrtw0j"

if __name__ == "__main__":
    # Only contact LeetCode when run as a script; importing this module for
    # its helpers should not trigger network requests.
    clusters = cluster_by_soln_strat(fav_list_id)
    pprint(clusters)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment