A crawler for hosted.weblate.org that checks whether the search feature correctly filters out strings that don't have screenshots.
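The check each worker ultimately performs can be distilled into a standalone sketch (a simplification of check_for_search_accuracy below; the translate URL passed in is illustrative, since the crawler discovers real component URLs from /projects/ itself):

    import re
    import requests
    from urllib.parse import urlencode
    from bs4 import BeautifulSoup

    def screenshot_filter_leaks(translate_url: str) -> bool:
        """Return True if a 'NOT has:screenshot' search still surfaces a unit with a screenshot."""
        query = urlencode({'q': 'NOT has:screenshot', 'sort_by': '-priority,position', 'offset': 1})
        page = requests.get(f'{translate_url}?{query}').text
        if "No strings found!" in page:
            return False  # the filter matched nothing for this component
        soup = BeautifulSoup(page, 'html.parser')
        # A unit matched by the filter should never render a "Screenshot context" section.
        return soup.find(string=re.compile("Screenshot context")) is not None

The full script below applies this check, with pagination and concurrency, across every component on hosted.weblate.org.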
#!/usr/bin/env python
from urllib.parse import urljoin, urlencode

# grequests applies gevent monkey-patching at import time; keep it imported before requests.
import grequests
import requests

import re
import json5
from typing import Generator
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
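# The crawl runs in four stages; each writes an intermediate file so a stage can be rerun on its own:
#   1. load_projects_links()             -> projects.txt
#   2. build_project_to_components_map() -> project_to_components_map.json
#   3. collect_all_source_string_links() -> source_string_links.txt
#   4. check_for_search_accuracy()       -> translation_links_with_inaccurate_search.txt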
def download_page(url: str) -> str:
    print(f"Downloading {url}")
    return requests.get(url).text


def async_get_pages(urls: list[str]) -> Generator:
    # imap_enumerated yields (index, response) pairs as responses complete, so the
    # original position in `urls` can be recovered even though results arrive out of order.
    def _get(_url):
        print(f"Getting {_url}")
        return grequests.get(_url)

    return grequests.imap_enumerated([_get(u) for u in urls])

BASE_URL = 'https://hosted.weblate.org/'


def write_list_to_file(results: set[str] | list[str], filename: str) -> None:
    with open(filename, 'w') as f:
        for result in results:
            f.write(result + '\n')


def write_json_to_file(json_content: dict, filename: str) -> None:
    with open(filename, 'w') as f:
        json5.dump(json_content, f)

def load_projects_links() -> None:
    # The project listing is paginated; the page count (12 pages of 100) is hard-coded
    # to cover the projects listed on hosted.weblate.org at the time of writing.
    links: list[str] = []
    for i in range(1, 13):
        projects_page = download_page(f'{BASE_URL}projects/?page={i}&limit=100')
        soup = BeautifulSoup(projects_page, 'html.parser')
        for link in soup.find_all('a', href=True):
            if link['href'].startswith('/projects/'):
                links.append(urljoin(BASE_URL, link['href']))
    write_list_to_file(set(links), 'projects.txt')

PROJECT_LINKS_FILE = 'projects.txt'


def list_from_file_generator(filename: str) -> Generator:
    with open(filename, 'r') as f:
        for line in f:
            yield line.strip()


PROJECT_TO_COMPONENT_FILENAME = 'project_to_components_map.json'

def build_project_to_components_map() -> None:
    project_to_components_map: dict[str, list[str]] = {}

    def get_component_links_for_project(project_url: str) -> None:
        project_code = project_url.split('/projects/')[1].strip('/')
        project_page = download_page(project_url)
        soup = BeautifulSoup(project_page, 'html.parser')
        component_links: list[str] = []
        for component in soup.find('div', id="components").find_all('th', class_="object-link"):
            component_links.append(urljoin(BASE_URL, component.find('a')['href']))
        project_to_components_map[project_code] = component_links

    try:
        with ThreadPoolExecutor() as executor:
            # Force evaluation of the lazy map so exceptions raised in workers propagate here.
            list(executor.map(get_component_links_for_project, list_from_file_generator(PROJECT_LINKS_FILE)))
    finally:
        # Persist whatever was collected, even if a worker failed part-way through.
        write_json_to_file(project_to_components_map, PROJECT_TO_COMPONENT_FILENAME)

def component_links_generator():
    with open(PROJECT_TO_COMPONENT_FILENAME, 'r') as f:
        project_to_links = json5.load(f)
    for project_code, links in project_to_links.items():
        for link in links:
            # Skip glossary components; they are not regular translation components.
            if not link.endswith('/glossary/'):
                yield link

SOURCE_STRINGS_LINKS_FILENAME = 'source_string_links.txt'


def collect_all_source_string_links() -> None:
    result: list[str] = []

    def find_source_strings_link_for_component(component_url: str) -> None:
        page = download_page(component_url)
        soup = BeautifulSoup(page, 'html.parser')
        # The row flagged as the source language links to the component's source strings.
        source_language_marker = soup.find('span', title="This language is used for source strings.")
        if source_language_marker and (neighbor_link := source_language_marker.parent.find('a')):
            result.append(urljoin(BASE_URL, neighbor_link['href']))

    try:
        with ThreadPoolExecutor() as executor:
            # Force evaluation of the lazy map so worker exceptions are not silently discarded.
            list(executor.map(find_source_strings_link_for_component, component_links_generator()))
    except Exception as error:
        print(f"Exited with error: {error}")
    finally:
        write_list_to_file(result, SOURCE_STRINGS_LINKS_FILENAME)

TRANSLATION_LINKS_WITH_INACCURATE_SEARCH = 'translation_links_with_inaccurate_search.txt'


def check_for_search_accuracy() -> None:
    SEARCH_QUERY = {
        'q': 'NOT has:screenshot',
        'sort_by': '-priority,position',
        'offset': '1',
    }
    faulty_translation_links = []

    def has_screenshot(page_soup: BeautifulSoup) -> bool:
        return page_soup.find(string=re.compile("Screenshot context")) is not None

    def check_for_screenshot(project_source_strings_link: str) -> None:
        def translation_link() -> str:
            return project_source_strings_link.replace('/projects/', '/translate/')

        first_translation_link = translation_link() + '?' + urlencode(SEARCH_QUERY)
        search_result_page = download_page(first_translation_link)
        if "No strings found!" not in search_result_page:
            soup = BeautifulSoup(search_result_page, 'html.parser')
            position_input = soup.find('div', id="position-input").text  # e.g. 1/1,235
            total_results = int(position_input.split('/')[1].strip().replace(",", ""))
            if has_screenshot(soup):
                print('found one at', first_translation_link)
                faulty_translation_links.append(first_translation_link)
            else:
                # Walk through up to 20 further results of the same search; any hit means the
                # "NOT has:screenshot" filter let a unit with a screenshot slip through.
                found_one = False
                offset_urls = [
                    translation_link() + '?' + urlencode(SEARCH_QUERY | {'offset': offset})
                    for offset in list(range(1, total_results))[:20]
                ]
                for index, response in async_get_pages(offset_urls):
                    if found_one:
                        break
                    search_result_page = response.text
                    if "The translation has come to an end" in search_result_page:
                        break
                    soup = BeautifulSoup(search_result_page, 'html.parser')
                    if has_screenshot(soup):
                        print('found one at', offset_urls[index])
                        faulty_translation_links.append(offset_urls[index])
                        found_one = True

    try:
        with ThreadPoolExecutor() as executor:
            # Force evaluation of the lazy map so worker exceptions propagate here.
            list(executor.map(check_for_screenshot, list_from_file_generator(SOURCE_STRINGS_LINKS_FILENAME)))
        # Serial variant, handy for debugging:
        # for link in list_from_file_generator(SOURCE_STRINGS_LINKS_FILENAME):
        #     check_for_screenshot(link)
        #     if len(faulty_translation_links) >= 3:
        #         break
    finally:
        print(faulty_translation_links)
        write_list_to_file(faulty_translation_links, TRANSLATION_LINKS_WITH_INACCURATE_SEARCH)

def main():
    load_projects_links()
    build_project_to_components_map()
    collect_all_source_string_links()
    check_for_search_accuracy()


if __name__ == '__main__':
    import time

    start = time.time()
    main()
    execution_time = time.time() - start
    print(f"Execution time: {execution_time:.2f} seconds")