Created
April 6, 2023 04:07
-
-
Save motoyasu-saburi/7a2a8ef9e3405a90016fcb26a70d012a to your computer and use it in GitHub Desktop.
FIXME 部分は変更必須。あと、3.9 以上の Python で実行できるはず。 上げ直し
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import json
import os
import traceback
from dataclasses import dataclass
from time import sleep
from typing import Any, NoReturn, Optional
from urllib.parse import quote

import requests
################
# FIXME: both values below must be replaced before running the script.
cookie = "FIXME"  # GitHub browser cookie (sent to the github.com web search endpoint)
GRAPH_TOKEN = "FIXME"  # GitHub API token: https://github.com/settings/tokens.
################

# Headers sent with every request to the GitHub REST API.
header = {
    "Accept": "application/vnd.github+json",
    "Authorization": f"Bearer {GRAPH_TOKEN}",
    "X-GitHub-Api-Version": "2022-11-28",
}

# Base URL that REST API paths are appended to.
api_url_base = "https://api.github.com/"
@dataclass
class CodeMetaData:
    """A single code-search hit, tagged with the keyword that produced it."""

    search_key: str  # keyword the search was run with
    owner_repo: str  # repository in "OWNER/REPO" form
    path: str  # `path` value of the search hit
    language: str  # `language_name` value of the search hit
@dataclass
class RepositoryMeta:
    """Repository-level figures used to decide whether a hit is worth reporting."""

    repo_url: str  # URL written to the report
    is_fork: bool  # True when the repository is a fork
    forks: int  # number of forks
    star: int  # number of stars
    watchers_count: int  # number of watchers
    # MEMO: filtering on size looked like a way to narrow results down to
    # program-like projects, but it could also drop memo files that hold
    # sensitive information, so it is deliberately not used.
    size: int
@dataclass
class Repository:
    """Pairs one code-search hit with the metadata of the repository it lives in."""

    repo_meta: RepositoryMeta  # repository-level metadata
    code_meta: CodeMetaData  # the code-search hit itself
def search_code_metadata(accumulator: list[CodeMetaData], keyword: str, page=1) -> list[CodeMetaData]:
    """
    Collect code-search hits for the regex query ``/@<keyword>/``.

    Calls the endpoint used by the github.com Code Search page directly
    (the new Code Search has no public API), authenticated with the
    module-level ``cookie``. Pages are fetched one after another, starting at
    ``page``, until the response reports that the last page was reached.

    Args:
        accumulator: Hits collected earlier. It is not mutated; its items are
            placed at the end of the returned list.
        keyword: Text inserted between ``/@`` and ``/`` in the query. It is
            percent-encoded here, so pass it unencoded.
        page: Page number to start from.

    Returns:
        The hits of the most recently fetched page first, then earlier pages,
        then ``accumulator``. If a response cannot be parsed, the traceback is
        printed and everything collected so far is returned (never ``None``).

    Raises:
        requests.RequestException: If the HTTP request itself fails or times out.
    """
    # Copied straight from the browser; many of these are probably unnecessary.
    browser_headers = {
        "accept": "application/json",
        "accept-language": "ja,en-US;q=0.9,en;q=0.8",
        "sec-ch-ua": "\"Chromium\";v=\"110\", \"Not A(Brand\";v=\"24\", \"Google Chrome\";v=\"110\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "x-github-target": "dotcom",
        "x-requested-with": "XMLHttpRequest",
        "cookie": cookie,
        "Referrer-Policy": "strict-origin-when-cross-origin",
        "body": None,
        "method": "GET"
    }
    # Encode the whole regex once so characters such as "&", "#", "+" or
    # spaces inside the keyword cannot break the query string.
    query = quote(f"/@{keyword}/", safe="")
    collected = list(accumulator)

    # Iterate instead of recursing: neither paging nor rate-limit retries can
    # then run into Python's recursion limit.
    while True:
        sleep(1)  # Undocumented API, so stay polite: at most one request per second.
        url = f"https://github.com/search?q={query}&type=code&p={page}"
        res = requests.get(url, headers=browser_headers, timeout=30)

        if res.status_code == 429:  # Rate limited: wait, then retry the same page.
            print("search_code_metadata() rate limit")
            sleep(61)
            continue

        try:
            # There is no API documentation; the schema is taken from observed responses.
            payload: dict[str, Any] = json.loads(res.content)["payload"]
            hits = [
                CodeMetaData(
                    search_key=keyword,
                    owner_repo=c["repo_nwo"],
                    path=c["path"],
                    language=c["language_name"],
                )
                for c in payload["results"]
            ]
            is_last_page = payload["page"] >= payload["page_count"]
            next_page = payload["page"] + 1
        except (ValueError, KeyError, TypeError):
            # Unexpected body (HTML error page, schema change, ...): keep what
            # was already gathered instead of throwing it all away.
            print(traceback.format_exc())
            return collected

        collected = hits + collected
        if is_last_page or next_page <= page:
            # The second condition stops the loop if the server does not
            # advance past the page that was requested.
            return collected
        page = next_page
def get_repo_metadata(owner: str, repo: str) -> Optional[RepositoryMeta]:
    """
    Fetch repository metadata (mainly star / fork / watcher counts).

    https://docs.github.com/ja/rest/repos/repos?apiVersion=2022-11-28#get-a-repository

    Args:
        owner: Repository owner (the part before "/" in "OWNER/REPO").
        repo: Repository name (the part after "/").

    Returns:
        The metadata, or ``None`` when the repository could not be fetched or
        the response could not be parsed (a message or traceback is printed).
    """
    url = api_url_base + f"repos/{owner}/{repo}"
    try:
        # Retry in a loop rather than recursively so that a long rate-limit
        # window cannot exhaust the recursion limit.
        while True:
            res = requests.get(url, headers=header, timeout=30)
            rate_limited = res.status_code == 429 or (
                res.status_code == 403 and b"rate limit" in res.content
            )
            if not rate_limited:
                break
            print("get_repo_metadata() rate limit")
            sleep(61)

        if res.status_code != 200:
            # e.g. 404 for a repository that was deleted or made private.
            print(f"get_repo_metadata() {owner}/{repo}: HTTP {res.status_code}")
            return None

        j = json.loads(res.content)
        return RepositoryMeta(
            repo_url=j["html_url"],
            forks=j["forks_count"],
            is_fork=j["fork"],
            star=j["stargazers_count"],
            # In the REST payload `watchers` mirrors the star count; the number
            # of users actually watching is `subscribers_count`. Fall back to
            # `watchers` if that field is absent.
            watchers_count=j.get("subscribers_count", j["watchers"]),
            size=j["size"]
        )
    except Exception:
        print(traceback.format_exc())
        return None
def filter_popular_repo(repos: list[Repository]) -> list[Repository]:
    """
    Remove popular repositories from ``repos``.

    A repository counts as popular when it has at least 2 forks, at least
    2 stars, or at least 2 watchers. The entries that remain keep their
    original order.
    """
    return [
        entry
        for entry in repos
        if not (
            entry.repo_meta.forks >= 2
            or entry.repo_meta.star >= 2
            or entry.repo_meta.watchers_count >= 2
        )
    ]
def filter_fork_repo(repos: list[Repository]) -> list[Repository]:
    """Remove forked repositories from ``repos``, preserving the order of the rest."""
    return [entry for entry in repos if not entry.repo_meta.is_fork]
def create_report(repos: list[Repository], filename: str) -> None:
    """
    Append ``repos`` to ``report/<filename>.csv`` under the current working directory.

    The ``report`` directory is created if it does not exist. The header row
    (``url,path,searchkey,owner``) is written only when the file is new or
    empty, so repeated runs do not scatter header lines through the data.
    Fields are written with the ``csv`` module so values containing commas or
    quotes (e.g. file paths) stay in their own column.

    Args:
        repos: Entries to write, one row each.
        filename: Report file name without the ``.csv`` extension.
    """
    # Reports are written to the `report` directory directly under the current directory.
    report_dir = os.path.join(os.getcwd(), "report")
    os.makedirs(report_dir, exist_ok=True)
    report_path = os.path.join(report_dir, f"{filename}.csv")

    needs_header = not os.path.exists(report_path) or os.path.getsize(report_path) == 0
    # newline="" lets the csv writer control line endings; "\n" keeps the
    # line terminator the report has always used.
    with open(report_path, "a", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        if needs_header:
            writer.writerow(["url", "path", "searchkey", "owner"])
        writer.writerows(
            [
                r.repo_meta.repo_url,
                r.code_meta.path,
                r.code_meta.search_key,
                r.code_meta.owner_repo.split("/")[0],
            ]
            for r in repos
        )
if __name__ == '__main__':
    # FIXME: keywords to search for. This is a rough implementation: to look
    # for `@example.com`, enter `example.com` here.
    # Note that search_code_metadata() inserts the `@` prefix (URL-encoded)
    # into the query by itself.
    keywords = [
    ]
    for keyword in keywords:
        # Escape "." so the regex search matches a literal dot. "\\." is the
        # same two characters the old "\." produced, without relying on an
        # invalid escape sequence.
        escaped_keyword = keyword.replace(".", "\\.")
        # `or []` guards against a search that yields nothing usable.
        codes: list[CodeMetaData] = search_code_metadata([], escaped_keyword) or []

        # Several hits often live in the same repository; fetch its metadata once.
        meta_by_repo: dict[str, Any] = {}
        repositories: list[Repository] = []
        for c in codes:
            if c.owner_repo not in meta_by_repo:
                owner, _, repo = c.owner_repo.partition("/")
                meta_by_repo[c.owner_repo] = get_repo_metadata(owner=owner, repo=repo)
            repo_meta = meta_by_repo[c.owner_repo]
            if repo_meta is None:
                # Metadata could not be fetched (deleted/private repository,
                # API error); the filters need it, so skip this hit.
                continue
            repositories.append(Repository(code_meta=c, repo_meta=repo_meta))

        filtered_repositories = filter_popular_repo(repositories)
        filtered_repositories = filter_fork_repo(filtered_repositories)
        if filtered_repositories:
            create_report(filtered_repositories, keyword)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment