Forked from ctlllll/longest_chinese_tokens_gpt4o.py
Last active
October 26, 2024 21:40
-
-
Save victorlin/2fa6ac60d257524f3ee26cfe5c471a08 to your computer and use it in GitHub Desktop.
Longest tokens per language in gpt4o
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Collect the longest tokens per detected language in the ``o200k_base`` vocabulary.

Every token id in the encoding is decoded, the decoded strings are sorted
longest-first, and each one is assigned to the language that ``langdetect``
reports for it. The per-language lists are written to
``2_longest_tokens.json``.
"""
import json

import iso639
import langdetect
import tiktoken

# Languages that must be fully populated before the scan stops.
# "zh-cn" is a raw langdetect code: when iso639 cannot turn a code into an
# English name, the code itself is used as the dictionary key (see below).
REQUIRED_LANGUAGES = ["zh-cn"]

# Minimum for required languages (the scan continues until each has this many)
# Maximum for optional languages (no list ever grows beyond this size)
TOKENS_PER_LANGUAGE = 20

# langdetect is randomized by default, so short strings can be classified
# differently from run to run; a fixed seed makes the output reproducible.
langdetect.DetectorFactory.seed = 0

T = tiktoken.get_encoding("o200k_base")

# Decode each token id exactly once and keep the text.
tokens: list[str] = []
for token_index in range(T.n_vocab):
    try:
        tokens.append(T.decode([token_index]))
    except Exception:
        # Best effort: ids that cannot be decoded are skipped. Catching
        # Exception instead of using a bare ``except`` keeps Ctrl-C working.
        # NOTE(review): assumes tiktoken signals an undecodable id with a
        # regular Exception subclass -- confirm for the pinned version.
        continue

# Sort by length, longest first. The sort is stable, so tokens of equal
# length stay in vocabulary order.
tokens.sort(key=len, reverse=True)

longest_tokens: dict[str, list[str]] = {}

for token in tokens:
    try:
        language = langdetect.detect(token)
    except langdetect.lang_detect_exception.LangDetectException:
        # Unknown language, skip entirely
        continue

    try:
        language = iso639.to_name(language)
    except iso639.NonExistentLanguageError:
        # Language code can't be converted to English name, just use the code
        pass

    bucket = longest_tokens.get(language)
    if bucket is None:
        print(f"New language: {language}")
        bucket = longest_tokens[language] = []

    if len(bucket) < TOKENS_PER_LANGUAGE:
        bucket.append(token)

    # Stop as soon as every required language has its full quota; tokens are
    # visited longest-first, so anything found later would be shorter anyway.
    if all(
        len(longest_tokens.get(required, [])) >= TOKENS_PER_LANGUAGE
        for required in REQUIRED_LANGUAGES
    ):
        break

# ensure_ascii=False emits raw non-ASCII text, so the file must be opened as
# UTF-8 explicitly instead of relying on the platform's default encoding.
with open('2_longest_tokens.json', 'w', encoding='utf-8') as json_file:
    json.dump(longest_tokens, json_file, ensure_ascii=False, indent=2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"Dutch; Flemish": [ | |
"abcdefghijklmnopqrstuvwxyz", | |
" verantwoordelijkheid", | |
" verantwoordelijk", | |
" persoonsgegevens", | |
" maatschappelijke", | |
" Desenvolvimento", | |
" gespecialiseerd", | |
" investissements", | |
" oorspronkelijke", | |
" tentoonstelling", | |
" waarschijnlijk", | |
" renseignements", | |
" tegelijkertijd", | |
" internationaal", | |
" geïnteresseerd", | |
" investissement", | |
" independientes", | |
"’investissement", | |
" arrondissement", | |
" aantrekkelijke" | |
], | |
"Tagalog": [ | |
"ABCDEFGHIJKLMNOPQRSTUVWXYZ", | |
"ABCDEFGHIJKLMNOP", | |
" sustainability", | |
" Rehabilitation", | |
" organisational", | |
" organisations", | |
" multinational", | |
" Compatibility", | |
" pinakamahusay", | |
" Organisations", | |
" organisation", | |
" Organisation", | |
" manipulation", | |
" Availability", | |
"’organisation", | |
"Compatibility", | |
" manipulating", | |
" Disabilities", | |
" longstanding", | |
" multilingual" | |
], | |
"Thai": [ | |
" สำนักเลขานุการองค์กร", | |
" วิเคราะห์บอลวันนี้", | |
" แขวงคลองเตยเหนือ", | |
" นักลงทุนสัมพันธ์", | |
"เปิดอภิปรายทั่วไป", | |
" แสดงความคิดเห็น", | |
"【อ่านข้อความเต็ม", | |
" ถ่ายทอดสดฟุตบอล", | |
" อาคารจีเอ็มเอ็ม", | |
"อ่านข้อความเต็ม", | |
" กรุงเทพมหานครฯ", | |
" สำนักเลขานุการ", | |
" ฝ่ายขายออนไลน์", | |
"เติมเงินไทยฟรี", | |
" ส่งเงินบาทไทย", | |
"สดงความคิดเห็น", | |
" คาสิโนออนไลน์", | |
" กรุงเทพมหานคร", | |
" ติดต่อฝ่ายขาย", | |
" ฝ่ายขายรายการ" | |
], | |
"English": [ | |
" telecommunications", | |
".onreadystatechange", | |
" Telecommunications", | |
".githubusercontent", | |
" htmlspecialchars", | |
" characterization", | |
" transformational", | |
" characteristics", | |
" straightforward", | |
" Redistributions", | |
" internationally", | |
" professionalism", | |
" accomplishments", | |
" troubleshooting", | |
" Congratulations", | |
" transformations", | |
" synchronization", | |
" instrumentation", | |
" functionalities", | |
" congratulations" | |
], | |
"German": [ | |
" selbstverständlich", | |
" Wahrscheinlichkeit", | |
" unterschiedlichen", | |
" Herausforderungen", | |
" STDMETHODCALLTYPE", | |
" unterschiedliche", | |
" Dienstleistungen", | |
" misunderstanding", | |
" Veröffentlichung", | |
" Veranstaltungen", | |
" unterschiedlich", | |
" Schwangerschaft", | |
" beziehungsweise", | |
" Herausforderung", | |
" Voraussetzungen", | |
" Geschäftsführer", | |
" Bundesregierung", | |
" Digitalisierung", | |
" Ansprechpartner", | |
" Geschwindigkeit" | |
], | |
"Italian": [ | |
" significativamente", | |
" disproportionately", | |
" interdisciplinary", | |
" commercialization", | |
" Responsibilities", | |
" interoperability", | |
" professionnelle", | |
" Representatives", | |
" differentiation", | |
" confidentiality", | |
" interpretations", | |
" internationalen", | |
" definitivamente", | |
" automaticamente", | |
" competitiveness", | |
" personalization", | |
" internationales", | |
" diversification", | |
" multiprocessing", | |
" classifications" | |
], | |
"Portuguese": [ | |
" responsabilidades", | |
" responsabilidade", | |
" gastrointestinal", | |
" desenvolvimento", | |
" aproximadamente", | |
" responsabilidad", | |
" administrations", | |
" administrativos", | |
" administrativas", | |
" administration", | |
" administrative", | |
" administrators", | |
" correspondence", | |
" Administrative", | |
" exclusivamente", | |
" posteriormente", | |
" acompanhamento", | |
" administrativa", | |
" reconhecimento", | |
" possibilidades" | |
], | |
"Romanian; Moldavian; Moldovan": [ | |
" multidisciplinary", | |
" vulnerabilities", | |
" particularmente", | |
" infrastructures", | |
" infraestructura", | |
" extraordinarily", | |
" extracurricular", | |
" infrastructure", | |
" pharmaceutical", | |
" cardiovascular", | |
" Infrastructure", | |
" Pharmaceutical", | |
".Infrastructure", | |
" infraestrutura", | |
" susceptibility", | |
" manufacturer's", | |
".infrastructure", | |
" approximately", | |
" manufacturing", | |
" manufacturers" | |
], | |
"French": [ | |
" responsibilities", | |
" particulièrement", | |
" caractéristiques", | |
" entrepreneurship", | |
" professionnelles", | |
" unconstitutional", | |
" Entrepreneurship", | |
" recommendations", | |
" representations", | |
" environmentally", | |
" implementations", | |
" entrepreneurial", | |
" supplémentaires", | |
" electromagnetic", | |
" fonctionnalités", | |
" Recommendations", | |
" automatiquement", | |
" experimentation", | |
" malheureusement", | |
" essentiellement" | |
], | |
"Spanish; Castilian": [ | |
" cryptocurrencies", | |
" correspondientes", | |
" establecimientos", | |
" sustentabilidade", | |
" características", | |
" internacionales", | |
" correspondiente", | |
" respectivamente", | |
" automáticamente", | |
" establecimiento", | |
" recomendaciones", | |
" investigaciones", | |
" estabelecimento", | |
" disponibilidade", | |
" estadounidenses", | |
" funcionalidades", | |
" entretenimiento", | |
" específicamente", | |
" acontecimientos", | |
" especificamente" | |
], | |
"Russian": [ | |
" государственного", | |
" законодательства", | |
" непосредственно", | |
" государственной", | |
" воспользоваться", | |
" ответственность", | |
" ответственности", | |
" специализирован", | |
" Республикасынын", | |
" государственных", | |
" самостоятельно", | |
" осуществляется", | |
" использованием", | |
" дополнительные", | |
" соответственно", | |
" дополнительных", | |
" производителей", | |
" представителей", | |
" восстановления", | |
" использования" | |
], | |
"Danish": [ | |
" Namminersorlutik", | |
" aforementioned", | |
".minecraftforge", | |
" omstandigheden", | |
" transformative", | |
" Kommunikations", | |
" belangstelling", | |
"<|endofprompt|>", | |
" investigating", | |
" Kommunikation", | |
" investigative", | |
" predetermined", | |
" funktionieren", | |
" veranderingen", | |
" refrigeration", | |
" tegenstelling", | |
" refrigerators", | |
" thunderstorms", | |
" Telefonnummer", | |
" alternatives" | |
], | |
"Tamil": [ | |
" தெரிவித்துள்ளார்", | |
"ப்பட்டுள்ளது", | |
" செய்யப்பட்ட", | |
" பயன்படுத்த", | |
" தெரிவித்த", | |
"ிக்கப்பட்ட", | |
" இருக்கும்", | |
"ப்படுகிறது", | |
" தொடர்ந்து", | |
" செய்திகள்", | |
" பகுதியில்", | |
" குறிப்பிட", | |
" வேண்டும்", | |
"ங்களுக்கு", | |
" நிலையில்", | |
" நடைபெற்ற", | |
" வருகிறது", | |
" வருகின்ற", | |
" மருத்துவ", | |
" திரைப்பட" | |
], | |
"Afrikaans": [ | |
".springframework", | |
" dienstverlening", | |
" ontwikkelingen", | |
" neighborhoods", | |
" Naalakkersuis", | |
" disadvantages", | |
" ondersteuning", | |
" georganiseerd", | |
" naalakkersuis", | |
" disadvantaged", | |
" voorbereiding", | |
" beoordelingen", | |
" naapertorlugu", | |
" ontwikkeling", | |
" samenwerking", | |
" ingredientes", | |
".bootstrapcdn", | |
" disadvantage", | |
" strawberries", | |
" interviewing" | |
], | |
"Swahili": [ | |
" MERCHANTABILITY", | |
"MERCHANTABILITY", | |
" humanitarian", | |
" Saskatchewan", | |
".MILLISECONDS", | |
" Kombination", | |
" faʻaaogaina", | |
" Lamborghini", | |
" kuhakikisha", | |
" humiliation", | |
" WARRANTIES", | |
" utilizando", | |
" unfamiliar", | |
" mbalimbali", | |
" Kazakhstan", | |
" ransomware", | |
" whakahaere", | |
" MANAGEMENT", | |
" UNIVERSITY", | |
" sawijining" | |
], | |
"Catalan; Valencian": [ | |
" representatives", | |
" Pharmaceuticals", | |
" pharmaceuticals", | |
" establishments", | |
" circunstancias", | |
" experimentally", | |
" Environmental", | |
" developmental", | |
" institucional", | |
" perfectamente", | |
" absolutamente", | |
" experimenting", | |
" probablemente", | |
" transferencia", | |
" antibacterial", | |
" perfeitamente", | |
" irresponsible", | |
" eventualmente", | |
" participants", | |
" temperatures" | |
], | |
"Vietnamese": [ | |
" NONINFRINGEMENT", | |
"_NOTIFICATION", | |
"_TRANSACTION", | |
" NEGLIGENCE", | |
" Philosophy", | |
".HORIZONTAL", | |
" thumbnails", | |
"_HORIZONTAL", | |
"_NAMESPACE", | |
"_THRESHOLD", | |
"_TIMESTAMP", | |
".thumbnail", | |
"QRSTUVWXYZ", | |
" QUESTIONS", | |
".TRAILING", | |
"_TEMPLATE", | |
" ngwaahịa", | |
" 天天爱彩票app", | |
" trucking", | |
" QCOMPARE" | |
], | |
"Japanese": [ | |
" ", | |
"ありがとうございました", | |
"ありがとうございます", | |
"VIPがお送りします", | |
" ", | |
" 風吹けば名無し" | |
], | |
"Finnish": [ | |
" Naalakkersuisut", | |
" kontaktannonser", | |
" simultaneously", | |
" Sustainability", | |
" mantenimiento", | |
" assassination", | |
" availability", | |
" Pennsylvania", | |
" Transmission", | |
" similarities", | |
" simultaneous", | |
" illumination", | |
" paasissutiss", | |
" kristiansand", | |
" optimisation", | |
" Taamaattumik", | |
" assimilation", | |
" maintaining", | |
" essentially", | |
" sensitivity" | |
], | |
"Estonian": [ | |
" հնարավորություն", | |
" microorganisms", | |
" Հանրապետության", | |
" მნიშვნელოვანია", | |
" vulnerability", | |
" ներկայացուցիչ", | |
" მნიშვნელოვანი", | |
" misunderstood", | |
" განსაკუთრებით", | |
" յուրաքանչյուր", | |
" assigiinngits", | |
" անվտանգության", | |
" կառավարության", | |
" պաշտպանության", | |
" tunngatillugu", | |
" განმავლობაში", | |
" საქართველოში", | |
" longitudinal", | |
" საერთაშორისო", | |
" დაკავშირებით" | |
], | |
"Swedish": [ | |
" Charlottesville", | |
" investigators", | |
" demonstrating", | |
" Thanksgiving", | |
" inflammatory", | |
" transferring", | |
" Jacksonville", | |
"-inflammatory", | |
" veterinarian", | |
" transmitting", | |
" Shutterstock", | |
"<|endoftext|>", | |
" anniversary", | |
" marketplace", | |
" frustrating", | |
" fundraising", | |
" Anniversary", | |
" versatility", | |
".Transparent", | |
" personagens" | |
], | |
"Somali": [ | |
"XXXXXXXXXXXXXXXX", | |
" isumaqatigiiss", | |
" downloadable", | |
" Soomaaliyeed", | |
" Madaxweynaha", | |
" hexadecimal", | |
" inuiaqatigi", | |
" Soomaaliya", | |
" Somaliland", | |
"adaxweynaha", | |
" sababaraha", | |
" Madagascar", | |
" bookmarked", | |
" Wednesdays", | |
" dashboards", | |
" habilidade", | |
" Madaxweyne", | |
" Federaalka", | |
" dashboard", | |
" Dashboard" | |
], | |
"Turkish": [ | |
" Türkmenistanyň", | |
"Türkmenistanyň", | |
" bulunmaktadır", | |
" standardized", | |
" Üniversitesi", | |
" gerçekleştir", | |
" gatnaşyklary", | |
" Azərbaycanın", | |
" Türkmenistan", | |
" nakenbilder", | |
" mümkinçilik", | |
" Multiplayer", | |
" durability", | |
" vocabulary", | |
" tarafından", | |
"abilirsiniz", | |
" Disneyland", | |
" veterinary", | |
" içerisinde", | |
" tarapyndan" | |
], | |
"Bulgarian": [ | |
" характеристики", | |
" административ", | |
" представители", | |
" недвижимости", | |
" организации", | |
" заболевания", | |
" характерист", | |
" предприятия", | |
" организация", | |
" направления", | |
" организаций", | |
" мероприятия", | |
" предприятий", | |
" определения", | |
" заболевание", | |
" иазгәеиҭеит", | |
" направление", | |
" ограничения", | |
" материалов", | |
" предприним" | |
], | |
"Norwegian": [ | |
" indispensables", | |
"/settingsdialog", | |
" advertisement", | |
" indispensable", | |
" unforgettable", | |
" Advertisement", | |
" inadvertently", | |
" intelligently", | |
" representing", | |
"Advertisement", | |
" storytelling", | |
" spokesperson", | |
" kennenlernen", | |
" overeenkomst", | |
" transporting", | |
" supermarkets", | |
" instellingen", | |
" homelessness", | |
" programmable", | |
" selvfølgelig" | |
], | |
"Indonesian": [ | |
" Berdimuhamedow", | |
" INTERNATIONAL", | |
" Parliamentary", | |
" Indianapolis", | |
" INTERRUPTION", | |
" organisaties", | |
" distributing", | |
" meningkatkan", | |
" temperaturas", | |
" Berdimuhamed", | |
" menghasilkan", | |
" mangrupikeun", | |
" disappearing", | |
" Temperaturen", | |
" pertandingan", | |
" safeguarding", | |
" guaranteeing", | |
" temperaturen", | |
" perkembangan", | |
" benchmarking" | |
], | |
"Welsh": [ | |
" methodological", | |
" carbohydrates", | |
" methodologies", | |
" biotechnology", | |
" granddaughter", | |
" neighborhood", | |
" Additionally", | |
" additionally", | |
" Philadelphia", | |
" carbohydrate", | |
" bodybuilding", | |
" polyethylene", | |
" methodology", | |
" cellpadding", | |
" wonderfully", | |
" dynamically", | |
" cylindrical", | |
"ynchronously", | |
" withdrawing", | |
" cauliflower" | |
], | |
"Albanian": [ | |
" establishment", | |
" refurbishment", | |
" partnerships", | |
" Optimization", | |
" stripslashes", | |
" partnership", | |
" Maharashtra", | |
"Optimization", | |
" aktiviteter", | |
" diminishing", | |
" nourishment", | |
" mesmerizing", | |
" punishment", | |
" earthquake", | |
" repetitive", | |
" diminished", | |
" permitindo", | |
" mentorship", | |
" primitives", | |
" Algorithms" | |
], | |
"Polish": [ | |
" psychological", | |
" Psychological", | |
".spongepowered", | |
" technologies", | |
" synchronized", | |
" psychologist", | |
" policymakers", | |
"\tsynchronized", | |
" przedsiębior", | |
" synchronize", | |
" najbardziej", | |
" psychedelic", | |
" Polytechnic", | |
" psychologie", | |
" technician", | |
" trajectory", | |
"ynchronized", | |
" organizers", | |
" randomized", | |
" strtolower" | |
], | |
"Lithuanian": [ | |
" Administrator", | |
" parliamentary", | |
" administratie", | |
" surprisingly", | |
" testimonials", | |
"Administrator", | |
" spirituality", | |
" Testimonials", | |
" tragamonedas", | |
" propietarios", | |
" provisioning", | |
" proprietary", | |
" preliminary", | |
".mybatisplus", | |
"Testimonials", | |
" variability", | |
" movimientos", | |
" Respublikas", | |
" Australia's", | |
" alojamiento" | |
], | |
"Slovak": [ | |
" predominantly", | |
" psychologists", | |
"stackoverflow", | |
" planejamento", | |
" psychiatrist", | |
"sprechpartner", | |
" photovoltaic", | |
" proprietário", | |
" provenientes", | |
" psychiatric", | |
" proveedores", | |
" Observatory", | |
" provocative", | |
" predominant", | |
" tecnológica", | |
" tecnológico", | |
" chromosomes", | |
" innovatieve", | |
"providername", | |
" prehistoric" | |
], | |
"Croatian": [ | |
".Globalization", | |
".djangoproject", | |
" globalization", | |
" Coronavirus", | |
" projections", | |
" aprendizaje", | |
" informacije", | |
" pornography", | |
" pornografia", | |
"/javascript", | |
" projection", | |
" organizing", | |
" subjective", | |
"@Injectable", | |
" Injectable", | |
" personajes", | |
" javascript", | |
" projectile", | |
" optimizing", | |
" Innovative" | |
], | |
"Slovenian; Slovene": [ | |
" invokevirtual", | |
" preparedness", | |
" vrijblijvend", | |
" obligatoire", | |
" Tripadvisor", | |
" spokeswoman", | |
" predstavlja", | |
" zdravljenje", | |
"culoskeletal", | |
" developing", | |
" innovative", | |
" Observable", | |
" vulnerable", | |
" governance", | |
" inevitable", | |
" observable", | |
" privileged", | |
" preventive", | |
" obtainable", | |
".observable" | |
], | |
"Nepali": [ | |
" प्रधानमन्त्री", | |
" प्रधानमंत्री", | |
" प्रतिक्रिया", | |
" कार्यक्रममा", | |
" कार्यक्रम", | |
" स्वास्थ्य", | |
" सार्वजनिक", | |
" प्रक्रिया", | |
" सुनिश्चित", | |
" क्षेत्रमा", | |
" नगरपालिका", | |
" उम्मीदवार", | |
" मन्त्रालय", | |
" अनुसन्धान", | |
" नियन्त्रण", | |
" प्राकृतिक", | |
" कार्यालय", | |
" राजनीतिक", | |
" प्रतिनिध", | |
" सुरक्षित" | |
], | |
"Bengali; Bangla": [ | |
" প্রধানমন্ত্রী", | |
" বিশ্ববিদ্যাল", | |
" গুরুত্বপূর্ণ", | |
" বৃহস্পতিবার", | |
" আন্তর্জাতিক", | |
" বাংলাদেশের", | |
" সেপ্টেম্বর", | |
" চেয়ারম্যান", | |
" প্রতিষ্ঠান", | |
" স্বাস্থ্য", | |
" সিদ্ধান্ত", | |
" হাসপাতালে", | |
" কর্মকর্তা", | |
" জানিয়েছেন", | |
" প্রতিনিধি", | |
" বিস্তারিত", | |
" বাংলাদেশ", | |
" প্রতিষ্ঠ", | |
" বিরুদ্ধে", | |
" অনুষ্ঠিত" | |
], | |
"Czech": [ | |
"mnopqrstuvwxyz", | |
" polypropylene", | |
" technologie", | |
" synchronous", | |
" společnosti", | |
" JSONObject", | |
" objectives", | |
".JSONObject", | |
" disponível", | |
"otechnology", | |
" prototypes", | |
"\tJSONObject", | |
" prosperous", | |
"(JSONObject", | |
" problemlos", | |
".prototype", | |
" obviously", | |
"JSONObject", | |
" prototype", | |
" souhaitez" | |
], | |
"Hindi": [ | |
" विश्वविद्यालय", | |
" महत्वपूर्ण", | |
" अधिकारियों", | |
" खिलाड़ियों", | |
" पाकिस्तान", | |
" क्षेत्रों", | |
" जिन्होंने", | |
" वैज्ञानिक", | |
" निर्धारित", | |
" सकारात्मक", | |
" उन्होंने", | |
" कांग्रेस", | |
" प्रदर्शन", | |
" आवश्यकता", | |
" इस्तेमाल", | |
" प्रभावित", | |
" परिवर्तन", | |
" प्रस्तुत", | |
" उद्देश्य", | |
" शुक्रवार" | |
], | |
"Latvian": [ | |
" Infrastruktur", | |
".parametrize", | |
" Vietnamese", | |
" inspiratie", | |
"azzjonijiet", | |
" summarizes", | |
" bienvenida", | |
" galvanized", | |
" astronauts", | |
" atrocities", | |
" satisfied", | |
" varieties", | |
" gastronom", | |
" retrieval", | |
" satisfies", | |
" nepiecieš", | |
" Retrieval", | |
" previstas", | |
" Parkplatz", | |
" kunststof" | |
], | |
"Kannada": [ | |
" ಹಿನ್ನೆಲೆಯಲ್ಲಿ", | |
" ತಿಳಿಸಿದ್ದಾರೆ", | |
" ಸಂದರ್ಭದಲ್ಲಿ", | |
" ಮುಖ್ಯಮಂತ್ರಿ", | |
" ಮಾಡಿದ್ದಾರೆ", | |
" ಹೇಳಿದ್ದಾರೆ", | |
"ುತ್ತಿದ್ದಾರೆ", | |
" ನೀಡಿದ್ದಾರೆ", | |
" ಆಸ್ಪತ್ರೆಗೆ", | |
" ಕಾರ್ಯಕ್ರಮ", | |
" ಕಾಂಗ್ರೆಸ್", | |
" ವಿದ್ಯಾರ್ಥ", | |
" ಸಾರ್ವಜನಿಕ", | |
" ರಾಷ್ಟ್ರೀಯ", | |
"ಿಸಿದ್ದಾರೆ", | |
" ಸೇರಿದಂತೆ", | |
" ಬೆಂಗಳೂರು", | |
"ಪ್ರಜಾವಾಣಿ", | |
" ಅಭಿವೃದ್ಧ", | |
" ಚಿಕಿತ್ಸೆ" | |
], | |
"Malayalam": [ | |
" തിരുവനന്തപുരം", | |
"തിരുവനന്തപുരം", | |
" മുഖ്യമന്ത്രി", | |
"ിച്ചിട്ടുണ്ട്", | |
" വ്യക്തമാക്കി", | |
"ിരിക്കുന്നത്", | |
"ുകയായിരുന്നു", | |
" ബന്ധപ്പെട്ട", | |
" വിദ്യാഭ്യാസ", | |
" സെക്രട്ടറി", | |
" പ്രസിഡന്റ്", | |
"ിച്ചിരുന്നു", | |
" പഞ്ചായത്ത്", | |
" സംസാരിച്ചു", | |
" കോഴിക്കോട്", | |
" പങ്കെടുക്ക", | |
"പ്പെടുത്തിയ", | |
"ിക്കുന്നത്", | |
" അറിയിച്ചു", | |
"ുവനന്തപുരം" | |
], | |
"Hungarian": [ | |
" horizontally", | |
" sérstaklega", | |
" absolutely", | |
" horizontal", | |
" Azərbaycan", | |
" negatively", | |
" Absolutely", | |
" événements", | |
" volleyball", | |
".horizontal", | |
" kilómetros", | |
"-horizontal", | |
"(horizontal", | |
" metallurgy", | |
" verzamelen", | |
" magazines", | |
" polyester", | |
"horizontal", | |
" Polyester", | |
" verfügbar" | |
], | |
"Ukrainian": [ | |
" акоронавирус", | |
" ознакомиться", | |
" коронавирус", | |
" мемлекеттік", | |
" результатов", | |
" проститутки", | |
" документов", | |
" установить", | |
" обратиться", | |
" әлеуметтік", | |
" заниматься", | |
" внутренних", | |
" адміністра", | |
" використов", | |
" результата", | |
" результат", | |
" рекоменду", | |
" төхөөрөмж", | |
" документа", | |
" абсолютно" | |
], | |
"Telugu": [ | |
" చేస్తున్నారు", | |
"ిస్తున్నారు", | |
" మాట్లాడుతూ", | |
" సంబంధించిన", | |
" చేస్తున్న", | |
" కార్యక్రమ", | |
" ప్రభుత్వం", | |
" ప్రస్తుతం", | |
" తెలిసిందే", | |
" సందర్భంగా", | |
" హైదరాబాద్", | |
" నేపథ్యంలో", | |
" కాంగ్రెస్", | |
" అధికారులు", | |
" ప్రభుత్వ", | |
" తెలిపారు", | |
"ుకున్నారు", | |
" పోలీసులు", | |
" ప్రత్యేక", | |
" దర్శకత్వ" | |
], | |
"Macedonian": [ | |
" противопоказ", | |
" һөкүмитиниң", | |
" мамлекеттик", | |
" аанацҳауеит", | |
" предусматри", | |
" Тоҷикистон", | |
" достаточно", | |
" предназнач", | |
" инструмент", | |
" знакомства", | |
" предусмотр", | |
" профилакти", | |
" подготовки", | |
"диғанлиқини", | |
" икәнликини", | |
" сотрудники", | |
" еиҭеиҳәеит", | |
" қиливатқан", | |
" корпоратив", | |
" автомобили" | |
], | |
"Marathi; Marāṭhī": [ | |
" मुख्यमंत्री", | |
" राष्ट्रपति", | |
" महाराष्ट्र", | |
" विद्यार्थी", | |
" राष्ट्रीय", | |
" विद्यार्थ", | |
" राष्ट्रिय", | |
" व्यक्तिगत", | |
" अभिनेत्री", | |
" प्रशिक्षण", | |
" त्यांच्या", | |
" व्यवस्था", | |
" प्रत्येक", | |
" प्रस्ताव", | |
" कर्मचारी", | |
" गिरफ्तार", | |
" विद्यालय", | |
" निर्वाचन", | |
" वास्तविक", | |
" स्वतंत्र" | |
], | |
"Arabic": [ | |
" الإلكترونية", | |
" التكنولوجيا", | |
" الاجتماعية", | |
" الاقتصادية", | |
" الإلكتروني", | |
" الفلسطينية", | |
" الانتخابات", | |
" الإسرائيلي", | |
" الكهربائية", | |
" البريطانية", | |
" الإنجليزية", | |
" استراتيجية", | |
" الأمريكية", | |
" المعلومات", | |
" الاجتماعي", | |
" الإسلامية", | |
" الفلسطيني", | |
" المستخدمة", | |
" الاستثمار", | |
" المواطنين" | |
], | |
"Greek, Modern (1453-); Greek": [ | |
" περισσότερο", | |
" πραγματοποι", | |
" χρησιμοποι", | |
" αποτέλεσμα", | |
" χαρακτηρισ", | |
" περισσότε", | |
" περίπτωση", | |
" βρίσκεται", | |
" τελευταία", | |
" κατάσταση", | |
" κυβέρνηση", | |
" πρόγραμμα", | |
" δημιουργ", | |
" λειτουργ", | |
" συγκεκρι", | |
" μεγαλύτε", | |
" διάρκεια", | |
" υπάρχουν", | |
" αποτελεί", | |
" συνέχεια" | |
], | |
"Gujarati": [ | |
" વિદ્યાર્થીઓ", | |
" વિસ્તારમાં", | |
" વિદ્યાર્થી", | |
" કાર્યક્રમ", | |
" જિલ્લામાં", | |
" કાર્યવાહી", | |
" જણાવ્યું", | |
" હોસ્પિટલ", | |
" કોંગ્રેસ", | |
" જિલ્લાના", | |
" કરવામાં", | |
" દરમિયાન", | |
" આપવામાં", | |
" વ્યક્તિ", | |
" અમદાવાદ", | |
" સામાન્ય", | |
" ગુજરાતી", | |
" હોવાનું", | |
" જાહેરાત", | |
" કેન્દ્ર" | |
], | |
"Korean": [ | |
" 微信公众号天天中彩票", | |
" 微信上的天天中彩票", | |
" 微信里的天天中彩票", | |
" 天天中彩票大神推荐", | |
" 彩神争霸大发快三", | |
" 天天中彩票中大奖", | |
" 天天中彩票双色球", | |
"大发展有限公司官网", | |
" 天天中彩票不中返", | |
" 天天中彩票为什么", | |
" 天天中彩票nba", | |
" 天天彩票与你同行", | |
" 天天中彩票一等奖", | |
" 天天中彩票中奖了", | |
" 手机上天天中彩票", | |
" 天天中彩票是不是", | |
" 天天中彩票怎么买", | |
" 彩神争霸官方下载", | |
",最新高清无码专区", | |
" 天天中彩票APP" | |
], | |
"zh-cn": [ | |
"_日本毛片免费视频观看", | |
" 中国福利彩票天天", | |
"久久免费热在线精品", | |
" 微信的天天中彩票", | |
"无码不卡高清免费v", | |
" 大发快三大小单双", | |
"给主人留下些什么吧", | |
" qq的天天中彩票", | |
"_日本一级特黄大片", | |
" 大发快三开奖结果", | |
" 彩神争霸邀请码", | |
"免费视频在线观看", | |
"无码不卡高清免费", | |
"无码一区二区三区", | |
" 大发时时彩计划", | |
"】【:】【“】【", | |
" 大发时时彩开奖", | |
" 大发时时彩怎么", | |
" 彩神争霸电脑版", | |
" 大发快三是国家" | |
], | |
"Hebrew": [ | |
" פּראָדוקטן", | |
" פּראָדוק", | |
" האחרונות", | |
" באַקומען", | |
" באמצעות", | |
" געווארן", | |
" אונדזער", | |
" געשריבן", | |
" להתמודד", | |
" קסנומקס", | |
" הראשונה" | |
], | |
"Persian; Farsi": [ | |
" افغانستان", | |
" استخدامها", | |
" باستخدام", | |
" بنابراین", | |
" استفاده", | |
" استخدام", | |
" دانشگاه", | |
" درخواست", | |
" استخراج", | |
" خانواده", | |
" شهرستان", | |
" پاڪستان", | |
" اینترنت" | |
], | |
"Urdu": [ | |
" پاکستانی", | |
" وزیراعظم", | |
" مسلمانوں", | |
" بلوچستان", | |
" ہندوستان", | |
" پاکستان", | |
" استعمال", | |
" پروگرام", | |
" ځواکونو" | |
] | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[tool.poetry]
package-mode = false

[tool.poetry.dependencies]
python = "^3.11"
tiktoken = "^0.8.0"
langdetect = "^1.0.9"
iso639 = "^0.1.4"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment