A crawler for hosted.weblate.org that checks whether the search feature correctly filters out strings that don't have screenshots.
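The check each worker ultimately performs can be distilled into a standalone sketch (a simplification of check_for_search_accuracy below; the translate URL passed in is illustrative, since the crawler discovers real component URLs from /projects/ itself):

    import re
    import requests
    from urllib.parse import urlencode
    from bs4 import BeautifulSoup

    def screenshot_filter_leaks(translate_url: str) -> bool:
        """Return True if a 'NOT has:screenshot' search still surfaces a unit with a screenshot."""
        query = urlencode({'q': 'NOT has:screenshot', 'sort_by': '-priority,position', 'offset': 1})
        page = requests.get(f'{translate_url}?{query}').text
        if "No strings found!" in page:
            return False  # the filter matched nothing for this component
        soup = BeautifulSoup(page, 'html.parser')
        # A unit matched by the filter should never render a "Screenshot context" section.
        return soup.find(string=re.compile("Screenshot context")) is not None

The full script below applies this check, with pagination and concurrency, across every component on hosted.weblate.org.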
#!/usr/bin/env python
from urllib.parse import urljoin, urlencode

# grequests applies gevent monkey-patching at import time; keep it imported before requests.
import grequests
import requests

import re
import json5
from typing import Generator
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
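# The crawl runs in four stages; each writes an intermediate file so a stage can be rerun on its own:
#   1. load_projects_links()             -> projects.txt
#   2. build_project_to_components_map() -> project_to_components_map.json
#   3. collect_all_source_string_links() -> source_string_links.txt
#   4. check_for_search_accuracy()       -> translation_links_with_inaccurate_search.txt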
def download_page(url: str) -> str:
    print(f"Downloading {url}")
    return requests.get(url).text


def async_get_pages(urls: list[str]) -> Generator:
    # imap_enumerated yields (index, response) pairs as responses complete, so the
    # original position in `urls` can be recovered even though results arrive out of order.
    def _get(_url):
        print(f"Getting {_url}")
        return grequests.get(_url)

    return grequests.imap_enumerated([_get(u) for u in urls])

BASE_URL = 'https://hosted.weblate.org/'


def write_list_to_file(results: set[str] | list[str], filename: str) -> None:
    with open(filename, 'w') as f:
        for result in results:
            f.write(result + '\n')


def write_json_to_file(json_content: dict, filename: str) -> None:
    with open(filename, 'w') as f:
        json5.dump(json_content, f)

def load_projects_links() -> None:
    # The project listing is paginated; the page count (12 pages of 100) is hard-coded
    # to cover the projects listed on hosted.weblate.org at the time of writing.
    links: list[str] = []
    for i in range(1, 13):
        projects_page = download_page(f'{BASE_URL}projects/?page={i}&limit=100')
        soup = BeautifulSoup(projects_page, 'html.parser')
        for link in soup.find_all('a', href=True):
            if link['href'].startswith('/projects/'):
                links.append(urljoin(BASE_URL, link['href']))
    write_list_to_file(set(links), 'projects.txt')

PROJECT_LINKS_FILE = 'projects.txt'


def list_from_file_generator(filename: str) -> Generator:
    with open(filename, 'r') as f:
        for line in f:
            yield line.strip()


PROJECT_TO_COMPONENT_FILENAME = 'project_to_components_map.json'

def build_project_to_components_map() -> None:
    project_to_components_map: dict[str, list[str]] = {}

    def get_component_links_for_project(project_url: str) -> None:
        project_code = project_url.split('/projects/')[1].strip('/')
        project_page = download_page(project_url)
        soup = BeautifulSoup(project_page, 'html.parser')
        component_links: list[str] = []
        for component in soup.find('div', id="components").find_all('th', class_="object-link"):
            component_links.append(urljoin(BASE_URL, component.find('a')['href']))
        project_to_components_map[project_code] = component_links

    try:
        with ThreadPoolExecutor() as executor:
            # Force evaluation of the lazy map so exceptions raised in workers propagate here.
            list(executor.map(get_component_links_for_project, list_from_file_generator(PROJECT_LINKS_FILE)))
    finally:
        # Persist whatever was collected, even if a worker failed part-way through.
        write_json_to_file(project_to_components_map, PROJECT_TO_COMPONENT_FILENAME)

def component_links_generator():
    with open(PROJECT_TO_COMPONENT_FILENAME, 'r') as f:
        project_to_links = json5.load(f)
    for project_code, links in project_to_links.items():
        for link in links:
            # Skip glossary components; they are not regular translation components.
            if not link.endswith('/glossary/'):
                yield link

SOURCE_STRINGS_LINKS_FILENAME = 'source_string_links.txt'


def collect_all_source_string_links() -> None:
    result: list[str] = []

    def find_source_strings_link_for_component(component_url: str) -> None:
        page = download_page(component_url)
        soup = BeautifulSoup(page, 'html.parser')
        # The row flagged as the source language links to the component's source strings.
        source_language_marker = soup.find('span', title="This language is used for source strings.")
        if source_language_marker and (neighbor_link := source_language_marker.parent.find('a')):
            result.append(urljoin(BASE_URL, neighbor_link['href']))

    try:
        with ThreadPoolExecutor() as executor:
            # Force evaluation of the lazy map so worker exceptions are not silently discarded.
            list(executor.map(find_source_strings_link_for_component, component_links_generator()))
    except Exception as error:
        print(f"Exited with error: {error}")
    finally:
        write_list_to_file(result, SOURCE_STRINGS_LINKS_FILENAME)

TRANSLATION_LINKS_WITH_INACCURATE_SEARCH = 'translation_links_with_inaccurate_search.txt'


def check_for_search_accuracy() -> None:
    SEARCH_QUERY = {
        'q': 'NOT has:screenshot',
        'sort_by': '-priority,position',
        'offset': '1',
    }
    faulty_translation_links = []

    def has_screenshot(page_soup: BeautifulSoup) -> bool:
        return page_soup.find(string=re.compile("Screenshot context")) is not None

    def check_for_screenshot(project_source_strings_link: str) -> None:
        def translation_link() -> str:
            return project_source_strings_link.replace('/projects/', '/translate/')

        first_translation_link = translation_link() + '?' + urlencode(SEARCH_QUERY)
        search_result_page = download_page(first_translation_link)
        if "No strings found!" not in search_result_page:
            soup = BeautifulSoup(search_result_page, 'html.parser')
            position_input = soup.find('div', id="position-input").text  # e.g. 1/1,235
            total_results = int(position_input.split('/')[1].strip().replace(",", ""))
            if has_screenshot(soup):
                print('found one at', first_translation_link)
                faulty_translation_links.append(first_translation_link)
            else:
                # Walk through up to 20 further results of the same search; any hit means the
                # "NOT has:screenshot" filter let a unit with a screenshot slip through.
                found_one = False
                offset_urls = [
                    translation_link() + '?' + urlencode(SEARCH_QUERY | {'offset': offset})
                    for offset in list(range(1, total_results))[:20]
                ]
                for index, response in async_get_pages(offset_urls):
                    if found_one:
                        break
                    search_result_page = response.text
                    if "The translation has come to an end" in search_result_page:
                        break
                    soup = BeautifulSoup(search_result_page, 'html.parser')
                    if has_screenshot(soup):
                        print('found one at', offset_urls[index])
                        faulty_translation_links.append(offset_urls[index])
                        found_one = True

    try:
        with ThreadPoolExecutor() as executor:
            # Force evaluation of the lazy map so worker exceptions propagate here.
            list(executor.map(check_for_screenshot, list_from_file_generator(SOURCE_STRINGS_LINKS_FILENAME)))
        # Serial variant, handy for debugging:
        # for link in list_from_file_generator(SOURCE_STRINGS_LINKS_FILENAME):
        #     check_for_screenshot(link)
        #     if len(faulty_translation_links) >= 3:
        #         break
    finally:
        print(faulty_translation_links)
        write_list_to_file(faulty_translation_links, TRANSLATION_LINKS_WITH_INACCURATE_SEARCH)

def main():
    load_projects_links()
    build_project_to_components_map()
    collect_all_source_string_links()
    check_for_search_accuracy()


if __name__ == '__main__':
    import time

    start = time.time()
    main()
    execution_time = time.time() - start
    print(f"Execution time: {execution_time:.2f} seconds")