A crawler for hosted.weblate.org that checks whether searching for "NOT has:screenshot" correctly excludes strings that do have screenshots.
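For context, each check builds a translate-view URL for a component's source strings and queries it with the "NOT has:screenshot" filter; any result page that still shows screenshot context is flagged. A minimal sketch of that URL shape (the project and component slugs below are placeholders, not real Weblate projects):

# Illustration only: the search URL pattern the crawler exercises.
from urllib.parse import urlencode

source_strings_link = "https://hosted.weblate.org/projects/some-project/some-component/en/"
query = {"q": "NOT has:screenshot", "sort_by": "-priority,position", "offset": "1"}
print(source_strings_link.replace("/projects/", "/translate/") + "?" + urlencode(query))
# https://hosted.weblate.org/translate/some-project/some-component/en/?q=NOT+has%3Ascreenshot&sort_by=-priority%2Cposition&offset=1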
#!/usr/bin/env python
import re
from concurrent.futures import ThreadPoolExecutor
from typing import Generator
from urllib.parse import urljoin, urlencode

import grequests  # imported before requests so gevent patches sockets early
import json5
import requests
from bs4 import BeautifulSoup


def download_page(url: str) -> str:
    """Fetch a page synchronously and return its HTML."""
    print(f"Downloading {url}")
    return requests.get(url).text


def async_get_pages(urls: list[str]) -> Generator:
    """Fetch several pages concurrently, yielding (index, response) pairs."""
    def _get(_url):
        print(f"Getting {_url}")
        return grequests.get(_url)

    return grequests.imap_enumerated([_get(u) for u in urls])
BASE_URL = 'https://hosted.weblate.org/'


def write_list_to_file(results: set[str] | list[str], filename: str) -> None:
    with open(filename, 'w') as f:
        for result in results:
            f.write(result + '\n')


def write_json_to_file(json_content: dict, filename: str) -> None:
    with open(filename, 'w') as f:
        json5.dump(json_content, f)


def load_projects_links() -> None:
    """Crawl the paginated project listing and save every project URL to projects.txt."""
    links: list[str] = []
    for i in range(1, 13):
        projects_page = download_page(f'{BASE_URL}projects/?page={i}&limit=100')
        soup = BeautifulSoup(projects_page, 'html.parser')
        for link in soup.find_all('a', href=True):
            if link['href'].startswith('/projects/'):
                links.append(urljoin(BASE_URL, link['href']))
    write_list_to_file(set(links), 'projects.txt')


PROJECT_LINKS_FILE = 'projects.txt'


def list_from_file_generator(filename: str):
    """Yield stripped lines from a text file, one at a time."""
    with open(filename, 'r') as f:
        for line in f:
            yield line.strip()
PROJECT_TO_COMPONENT_FILENAME = 'project_to_components_map.json'


def build_project_to_components_map() -> None:
    """Visit every project page and map its slug to the component URLs it lists."""
    project_to_components_map = {}

    def get_component_links_for_project(project_url: str) -> None:
        project_code = project_url.split('/projects/')[1].strip('/')
        project_page = download_page(project_url)
        soup = BeautifulSoup(project_page, 'html.parser')
        component_links: list[str] = []
        for component in soup.find('div', id="components").find_all('th', class_="object-link"):
            component_links.append(urljoin(BASE_URL, component.find('a')['href']))
        project_to_components_map[project_code] = component_links

    try:
        with ThreadPoolExecutor() as executor:
            executor.map(get_component_links_for_project, list_from_file_generator(PROJECT_LINKS_FILE))
    finally:
        # Persist whatever was collected, even if a worker raised.
        write_json_to_file(project_to_components_map, PROJECT_TO_COMPONENT_FILENAME)


def component_links_generator():
    """Yield every component URL from the saved map, skipping glossary components."""
    with open(PROJECT_TO_COMPONENT_FILENAME, 'r') as f:
        project_to_links = json5.load(f)
    for project_code, links in project_to_links.items():
        for link in links:
            if not link.endswith('/glossary/'):
                yield link
SOURCE_STRINGS_LINKS_FILENAME = 'source_string_links.txt'


def collect_all_source_string_links() -> None:
    """For each component, find the translation view that holds its source strings."""
    result: list[str] = []

    def find_source_strings_link_for_component(component_url: str) -> None:
        page = download_page(component_url)
        soup = BeautifulSoup(page, 'html.parser')
        if neighbor_link := soup.find('span', title="This language is used for source strings.").parent.find('a'):
            result.append(urljoin(BASE_URL, neighbor_link['href']))

    try:
        with ThreadPoolExecutor() as executor:
            executor.map(find_source_strings_link_for_component, component_links_generator())
    except Exception:
        print("Exited with error")
    finally:
        write_list_to_file(result, SOURCE_STRINGS_LINKS_FILENAME)
TRANSLATION_LINKS_WITH_INACCURATE_SEARCH = 'translation_links_with_inaccurate_search.txt'


def check_for_search_accuracy() -> None:
    """Query each source-strings view with 'NOT has:screenshot' and flag results that still show a screenshot."""
    SEARCH_QUERY = {
        'q': 'NOT has:screenshot',
        'sort_by': '-priority,position',
        'offset': '1'
    }
    faulty_translation_links = []

    def has_screenshot(page_soup: BeautifulSoup) -> bool:
        return page_soup.find(string=re.compile("Screenshot context")) is not None

    def check_for_screenshot(project_source_strings_link: str) -> None:
        def translation_link():
            return project_source_strings_link.replace('/projects/', '/translate/')

        first_translation_link = translation_link() + '?' + urlencode(SEARCH_QUERY)
        search_result_page = download_page(first_translation_link)
        if "No strings found!" not in search_result_page:
            soup = BeautifulSoup(search_result_page, 'html.parser')
            position_input = soup.find('div', id="position-input").text  # e.g. 1 / 1,235
            total_results = int(position_input.split('/')[1].strip().replace(",", ""))
            if has_screenshot(soup):
                print('found one at', first_translation_link)
                faulty_translation_links.append(first_translation_link)
            else:
                # Walk the remaining result pages (capped at 20) concurrently.
                found_one = False
                offset_urls = [
                    translation_link() + '?' + urlencode(SEARCH_QUERY | {'offset': offset})
                    for offset in list(range(1, total_results))[:20]
                ]
                for index, response in async_get_pages(offset_urls):
                    if found_one:
                        break
                    search_result_page = response.text
                    if "The translation has come to an end" in search_result_page:
                        break
                    soup = BeautifulSoup(search_result_page, 'html.parser')
                    if has_screenshot(soup):
                        print('found one at', offset_urls[index])
                        faulty_translation_links.append(offset_urls[index])
                        found_one = True

    try:
        with ThreadPoolExecutor() as executor:
            executor.map(check_for_screenshot, list_from_file_generator(SOURCE_STRINGS_LINKS_FILENAME))
        # for link in list_from_file_generator(SOURCE_STRINGS_LINKS_FILENAME):
        #     check_for_screenshot(link)
        #     if len(faulty_translation_links) >= 3:
        #         break
    finally:
        print(faulty_translation_links)
        write_list_to_file(faulty_translation_links, TRANSLATION_LINKS_WITH_INACCURATE_SEARCH)
def main():
    load_projects_links()
    build_project_to_components_map()
    collect_all_source_string_links()
    check_for_search_accuracy()


if __name__ == '__main__':
    import time

    start = time.time()
    main()
    execution_time = time.time() - start
    print(f"Execution time: {execution_time:.2f} seconds")