Created
March 13, 2015 07:55
-
-
Save google-code-export/73a6c635646545ed7c3c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import re | |
import sys | |
# Seed labels for the crawl; add_tag() appends further entries at runtime.
# The first entry is the empty string, i.e. a query with a bare "label:" term.
tags = [
    '', 'python', 'javascript', 'django', 'web', 'google', 'java', 'ajax',
    'rails', 'plugin', 'android', 'cplusplus', 'mysql', 'dotnet', 'game',
    'appengine', 'php', 'flash', 'jquery', 'database', 'gwt',
]
# Every entry ever placed in `tags`, kept as a set for fast duplicate checks.
seen_tags = set(tags)
# Project names whose export URL has already been printed.
projects = set()
def get_tag():
    """Yield every entry of the module-level ``tags`` list, in order.

    The list length is re-read before each step, so entries appended to
    ``tags`` while the generator is being consumed are yielded as well.
    The generator finishes once the index reaches the end of the list.
    """
    i = 0
    while i < len(tags):
        yield tags[i]
        i += 1
def add_tag(tag):
    """Append ``tag`` to ``tags`` unless it has been queued before.

    ``seen_tags`` records every entry ever queued, so a repeated call with
    the same value leaves both containers unchanged.
    """
    if tag not in seen_tags:
        tags.append(tag)
        seen_tags.add(tag)
SEARCH_URL = 'https://code.google.com/hosting/search?q=label%3A'
EXPORT_URL = 'https://code.google.com/export-to-github/export?project='

# Compiled once, outside the loop. Raw strings keep the regex escapes
# (\d, \?) from being read as (invalid) string escapes.
RESULT_COUNT_RE = re.compile(r'Results \d+ - \d+ of (\d+)')
LABEL_LINK_RE = re.compile(r'<a href="/hosting/search\?q=label:([^"]+)">')
PROJECT_LINK_RE = re.compile(r'<a href="/p/([^/"]+)/">')

# get_tag() keeps yielding entries that add_tag() appends during the loop,
# so the work list grows while it is being processed.
for tag in get_tag():
    url = SEARCH_URL + tag
    r = requests.get(url)

    if '&' not in tag:
        # Entry without an '&start=' suffix: read the total result count and
        # queue one '&start=' entry per offset from 50 upward in steps of 10.
        match = RESULT_COUNT_RE.search(r.text)
        if match is None:
            print(':( could not get {}'.format(url), file=sys.stderr)
            continue
        num_result = int(match.group(1))
        for start in range(50, num_result, 10):
            add_tag(tag + '&start=' + str(start))
        # NOTE(review): the flattened source does not show whether this
        # `continue` sat at `if` level or inside the `for` above. It is kept
        # at `if` level (inside the loop it would be a no-op), which means
        # this page itself is not scanned for labels or projects -- confirm.
        continue

    # Queue every label linked from this page (lower-cased, de-duplicated).
    new_tags = {label.lower() for label in LABEL_LINK_RE.findall(r.text)}
    for new_tag in new_tags:
        add_tag(new_tag)

    # Print one export URL per project not reported before.
    new_projects = set(PROJECT_LINK_RE.findall(r.text)) - projects
    if new_projects:
        print('\n'.join(EXPORT_URL + name for name in new_projects))
        projects |= new_projects
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment