Skip to content

Instantly share code, notes, and snippets.

@victorlin
Forked from ctlllll/longest_chinese_tokens_gpt4o.py
Last active October 26, 2024 21:40
Show Gist options
  • Save victorlin/2fa6ac60d257524f3ee26cfe5c471a08 to your computer and use it in GitHub Desktop.
Save victorlin/2fa6ac60d257524f3ee26cfe5c471a08 to your computer and use it in GitHub Desktop.
Longest tokens per language in GPT-4o's tokenizer (o200k_base)
import iso639
import json
import langdetect
import tiktoken
# Script: list the longest tokens of the "o200k_base" tiktoken encoding,
# grouped by the language that langdetect assigns to each token, and write
# the result to 2_longest_tokens.json.

# Languages that must be filled before the scan may stop. Keys in the result
# are English names from iso639 when available, otherwise the raw langdetect
# code, so entries here must use whichever form ends up as the key.
REQUIRED_LANGUAGES = ["zh-cn"]

# Per-language cap on collected tokens. The scan stops once every required
# language has reached this count, so it is a minimum for required languages
# and a maximum for all other (optional) languages.
TOKENS_PER_LANGUAGE = 20

# langdetect is randomised by default; fixing the seed makes repeated runs
# produce the same classification (see the langdetect README).
langdetect.DetectorFactory.seed = 0

T = tiktoken.get_encoding("o200k_base")

# Decode every token id once and remember the text so the classification loop
# below does not have to decode again.
token_text: dict[int, str] = {}
for token_index in range(T.n_vocab):
    try:
        token_text[token_index] = T.decode([token_index])
    except KeyError:
        # Id without a token behind it; skip it.
        # NOTE(review): tiktoken 0.8 signals this with KeyError - confirm if
        # the pinned tiktoken version changes.
        continue

# (token id, decoded length) pairs, longest first. The sort is stable, so
# tokens of equal length stay in ascending id order.
tokens: list[tuple[int, int]] = [
    (token_index, len(text)) for token_index, text in token_text.items()
]
tokens.sort(key=lambda item: item[1], reverse=True)

longest_tokens: dict[str, list[str]] = {}
for token_index, _length in tokens:
    token = token_text[token_index]
    try:
        language = langdetect.detect(token)
        language = iso639.to_name(language)
    except langdetect.lang_detect_exception.LangDetectException:
        # Unknown language, skip entirely
        continue
    except iso639.NonExistentLanguageError:
        # Language code can't be converted to English name, just use the code
        # (`language` still holds the value returned by langdetect).
        pass

    if language not in longest_tokens:
        print(f"New language: {language}")
        longest_tokens[language] = []
    if len(longest_tokens[language]) < TOKENS_PER_LANGUAGE:
        longest_tokens[language].append(token)

    # Stop early once all required languages are full. With no required
    # languages there is nothing to wait for, so scan the whole vocabulary
    # instead of stopping after the first token.
    if REQUIRED_LANGUAGES and all(
        len(longest_tokens.get(required, [])) >= TOKENS_PER_LANGUAGE
        for required in REQUIRED_LANGUAGES
    ):
        break

# ensure_ascii=False emits the tokens as raw non-ASCII text, so the file must
# be opened as UTF-8 explicitly instead of relying on the platform default.
with open('2_longest_tokens.json', 'w', encoding="utf-8") as json_file:
    json.dump(longest_tokens, json_file, ensure_ascii=False, indent=2)
{
"Dutch; Flemish": [
"abcdefghijklmnopqrstuvwxyz",
" verantwoordelijkheid",
" verantwoordelijk",
" persoonsgegevens",
" maatschappelijke",
" Desenvolvimento",
" gespecialiseerd",
" investissements",
" oorspronkelijke",
" tentoonstelling",
" waarschijnlijk",
" renseignements",
" tegelijkertijd",
" internationaal",
" geïnteresseerd",
" investissement",
" independientes",
"’investissement",
" arrondissement",
" aantrekkelijke"
],
"Tagalog": [
"ABCDEFGHIJKLMNOPQRSTUVWXYZ",
"ABCDEFGHIJKLMNOP",
" sustainability",
" Rehabilitation",
" organisational",
" organisations",
" multinational",
" Compatibility",
" pinakamahusay",
" Organisations",
" organisation",
" Organisation",
" manipulation",
" Availability",
"’organisation",
"Compatibility",
" manipulating",
" Disabilities",
" longstanding",
" multilingual"
],
"Thai": [
" สำนักเลขานุการองค์กร",
" วิเคราะห์บอลวันนี้",
" แขวงคลองเตยเหนือ",
" นักลงทุนสัมพันธ์",
"เปิดอภิปรายทั่วไป",
" แสดงความคิดเห็น",
"【อ่านข้อความเต็ม",
" ถ่ายทอดสดฟุตบอล",
" อาคารจีเอ็มเอ็ม",
"อ่านข้อความเต็ม",
" กรุงเทพมหานครฯ",
" สำนักเลขานุการ",
" ฝ่ายขายออนไลน์",
"เติมเงินไทยฟรี",
" ส่งเงินบาทไทย",
"สดงความคิดเห็น",
" คาสิโนออนไลน์",
" กรุงเทพมหานคร",
" ติดต่อฝ่ายขาย",
" ฝ่ายขายรายการ"
],
"English": [
" telecommunications",
".onreadystatechange",
" Telecommunications",
".githubusercontent",
" htmlspecialchars",
" characterization",
" transformational",
" characteristics",
" straightforward",
" Redistributions",
" internationally",
" professionalism",
" accomplishments",
" troubleshooting",
" Congratulations",
" transformations",
" synchronization",
" instrumentation",
" functionalities",
" congratulations"
],
"German": [
" selbstverständlich",
" Wahrscheinlichkeit",
" unterschiedlichen",
" Herausforderungen",
" STDMETHODCALLTYPE",
" unterschiedliche",
" Dienstleistungen",
" misunderstanding",
" Veröffentlichung",
" Veranstaltungen",
" unterschiedlich",
" Schwangerschaft",
" beziehungsweise",
" Herausforderung",
" Voraussetzungen",
" Geschäftsführer",
" Bundesregierung",
" Digitalisierung",
" Ansprechpartner",
" Geschwindigkeit"
],
"Italian": [
" significativamente",
" disproportionately",
" interdisciplinary",
" commercialization",
" Responsibilities",
" interoperability",
" professionnelle",
" Representatives",
" differentiation",
" confidentiality",
" interpretations",
" internationalen",
" definitivamente",
" automaticamente",
" competitiveness",
" personalization",
" internationales",
" diversification",
" multiprocessing",
" classifications"
],
"Portuguese": [
" responsabilidades",
" responsabilidade",
" gastrointestinal",
" desenvolvimento",
" aproximadamente",
" responsabilidad",
" administrations",
" administrativos",
" administrativas",
" administration",
" administrative",
" administrators",
" correspondence",
" Administrative",
" exclusivamente",
" posteriormente",
" acompanhamento",
" administrativa",
" reconhecimento",
" possibilidades"
],
"Romanian; Moldavian; Moldovan": [
" multidisciplinary",
" vulnerabilities",
" particularmente",
" infrastructures",
" infraestructura",
" extraordinarily",
" extracurricular",
" infrastructure",
" pharmaceutical",
" cardiovascular",
" Infrastructure",
" Pharmaceutical",
".Infrastructure",
" infraestrutura",
" susceptibility",
" manufacturer's",
".infrastructure",
" approximately",
" manufacturing",
" manufacturers"
],
"French": [
" responsibilities",
" particulièrement",
" caractéristiques",
" entrepreneurship",
" professionnelles",
" unconstitutional",
" Entrepreneurship",
" recommendations",
" representations",
" environmentally",
" implementations",
" entrepreneurial",
" supplémentaires",
" electromagnetic",
" fonctionnalités",
" Recommendations",
" automatiquement",
" experimentation",
" malheureusement",
" essentiellement"
],
"Spanish; Castilian": [
" cryptocurrencies",
" correspondientes",
" establecimientos",
" sustentabilidade",
" características",
" internacionales",
" correspondiente",
" respectivamente",
" automáticamente",
" establecimiento",
" recomendaciones",
" investigaciones",
" estabelecimento",
" disponibilidade",
" estadounidenses",
" funcionalidades",
" entretenimiento",
" específicamente",
" acontecimientos",
" especificamente"
],
"Russian": [
" государственного",
" законодательства",
" непосредственно",
" государственной",
" воспользоваться",
" ответственность",
" ответственности",
" специализирован",
" Республикасынын",
" государственных",
" самостоятельно",
" осуществляется",
" использованием",
" дополнительные",
" соответственно",
" дополнительных",
" производителей",
" представителей",
" восстановления",
" использования"
],
"Danish": [
" Namminersorlutik",
" aforementioned",
".minecraftforge",
" omstandigheden",
" transformative",
" Kommunikations",
" belangstelling",
"<|endofprompt|>",
" investigating",
" Kommunikation",
" investigative",
" predetermined",
" funktionieren",
" veranderingen",
" refrigeration",
" tegenstelling",
" refrigerators",
" thunderstorms",
" Telefonnummer",
" alternatives"
],
"Tamil": [
" தெரிவித்துள்ளார்",
"ப்பட்டுள்ளது",
" செய்யப்பட்ட",
" பயன்படுத்த",
" தெரிவித்த",
"ிக்கப்பட்ட",
" இருக்கும்",
"ப்படுகிறது",
" தொடர்ந்து",
" செய்திகள்",
" பகுதியில்",
" குறிப்பிட",
" வேண்டும்",
"ங்களுக்கு",
" நிலையில்",
" நடைபெற்ற",
" வருகிறது",
" வருகின்ற",
" மருத்துவ",
" திரைப்பட"
],
"Afrikaans": [
".springframework",
" dienstverlening",
" ontwikkelingen",
" neighborhoods",
" Naalakkersuis",
" disadvantages",
" ondersteuning",
" georganiseerd",
" naalakkersuis",
" disadvantaged",
" voorbereiding",
" beoordelingen",
" naapertorlugu",
" ontwikkeling",
" samenwerking",
" ingredientes",
".bootstrapcdn",
" disadvantage",
" strawberries",
" interviewing"
],
"Swahili": [
" MERCHANTABILITY",
"MERCHANTABILITY",
" humanitarian",
" Saskatchewan",
".MILLISECONDS",
" Kombination",
" faʻaaogaina",
" Lamborghini",
" kuhakikisha",
" humiliation",
" WARRANTIES",
" utilizando",
" unfamiliar",
" mbalimbali",
" Kazakhstan",
" ransomware",
" whakahaere",
" MANAGEMENT",
" UNIVERSITY",
" sawijining"
],
"Catalan; Valencian": [
" representatives",
" Pharmaceuticals",
" pharmaceuticals",
" establishments",
" circunstancias",
" experimentally",
" Environmental",
" developmental",
" institucional",
" perfectamente",
" absolutamente",
" experimenting",
" probablemente",
" transferencia",
" antibacterial",
" perfeitamente",
" irresponsible",
" eventualmente",
" participants",
" temperatures"
],
"Vietnamese": [
" NONINFRINGEMENT",
"_NOTIFICATION",
"_TRANSACTION",
" NEGLIGENCE",
" Philosophy",
".HORIZONTAL",
" thumbnails",
"_HORIZONTAL",
"_NAMESPACE",
"_THRESHOLD",
"_TIMESTAMP",
".thumbnail",
"QRSTUVWXYZ",
" QUESTIONS",
".TRAILING",
"_TEMPLATE",
" ngwaahịa",
" 天天爱彩票app",
" trucking",
" QCOMPARE"
],
"Japanese": [
"                ",
"ありがとうございました",
"ありがとうございます",
"VIPがお送りします",
"        ",
" 風吹けば名無し"
],
"Finnish": [
" Naalakkersuisut",
" kontaktannonser",
" simultaneously",
" Sustainability",
" mantenimiento",
" assassination",
" availability",
" Pennsylvania",
" Transmission",
" similarities",
" simultaneous",
" illumination",
" paasissutiss",
" kristiansand",
" optimisation",
" Taamaattumik",
" assimilation",
" maintaining",
" essentially",
" sensitivity"
],
"Estonian": [
" հնարավորություն",
" microorganisms",
" Հանրապետության",
" მნიშვნელოვანია",
" vulnerability",
" ներկայացուցիչ",
" მნიშვნელოვანი",
" misunderstood",
" განსაკუთრებით",
" յուրաքանչյուր",
" assigiinngits",
" անվտանգության",
" կառավարության",
" պաշտպանության",
" tunngatillugu",
" განმავლობაში",
" საქართველოში",
" longitudinal",
" საერთაშორისო",
" დაკავშირებით"
],
"Swedish": [
" Charlottesville",
" investigators",
" demonstrating",
" Thanksgiving",
" inflammatory",
" transferring",
" Jacksonville",
"-inflammatory",
" veterinarian",
" transmitting",
" Shutterstock",
"<|endoftext|>",
" anniversary",
" marketplace",
" frustrating",
" fundraising",
" Anniversary",
" versatility",
".Transparent",
" personagens"
],
"Somali": [
"XXXXXXXXXXXXXXXX",
" isumaqatigiiss",
" downloadable",
" Soomaaliyeed",
" Madaxweynaha",
" hexadecimal",
" inuiaqatigi",
" Soomaaliya",
" Somaliland",
"adaxweynaha",
" sababaraha",
" Madagascar",
" bookmarked",
" Wednesdays",
" dashboards",
" habilidade",
" Madaxweyne",
" Federaalka",
" dashboard",
" Dashboard"
],
"Turkish": [
" Türkmenistanyň",
"Türkmenistanyň",
" bulunmaktadır",
" standardized",
" Üniversitesi",
" gerçekleştir",
" gatnaşyklary",
" Azərbaycanın",
" Türkmenistan",
" nakenbilder",
" mümkinçilik",
" Multiplayer",
" durability",
" vocabulary",
" tarafından",
"abilirsiniz",
" Disneyland",
" veterinary",
" içerisinde",
" tarapyndan"
],
"Bulgarian": [
" характеристики",
" административ",
" представители",
" недвижимости",
" организации",
" заболевания",
" характерист",
" предприятия",
" организация",
" направления",
" организаций",
" мероприятия",
" предприятий",
" определения",
" заболевание",
" иазгәеиҭеит",
" направление",
" ограничения",
" материалов",
" предприним"
],
"Norwegian": [
" indispensables",
"/settingsdialog",
" advertisement",
" indispensable",
" unforgettable",
" Advertisement",
" inadvertently",
" intelligently",
" representing",
"Advertisement",
" storytelling",
" spokesperson",
" kennenlernen",
" overeenkomst",
" transporting",
" supermarkets",
" instellingen",
" homelessness",
" programmable",
" selvfølgelig"
],
"Indonesian": [
" Berdimuhamedow",
" INTERNATIONAL",
" Parliamentary",
" Indianapolis",
" INTERRUPTION",
" organisaties",
" distributing",
" meningkatkan",
" temperaturas",
" Berdimuhamed",
" menghasilkan",
" mangrupikeun",
" disappearing",
" Temperaturen",
" pertandingan",
" safeguarding",
" guaranteeing",
" temperaturen",
" perkembangan",
" benchmarking"
],
"Welsh": [
" methodological",
" carbohydrates",
" methodologies",
" biotechnology",
" granddaughter",
" neighborhood",
" Additionally",
" additionally",
" Philadelphia",
" carbohydrate",
" bodybuilding",
" polyethylene",
" methodology",
" cellpadding",
" wonderfully",
" dynamically",
" cylindrical",
"ynchronously",
" withdrawing",
" cauliflower"
],
"Albanian": [
" establishment",
" refurbishment",
" partnerships",
" Optimization",
" stripslashes",
" partnership",
" Maharashtra",
"Optimization",
" aktiviteter",
" diminishing",
" nourishment",
" mesmerizing",
" punishment",
" earthquake",
" repetitive",
" diminished",
" permitindo",
" mentorship",
" primitives",
" Algorithms"
],
"Polish": [
" psychological",
" Psychological",
".spongepowered",
" technologies",
" synchronized",
" psychologist",
" policymakers",
"\tsynchronized",
" przedsiębior",
" synchronize",
" najbardziej",
" psychedelic",
" Polytechnic",
" psychologie",
" technician",
" trajectory",
"ynchronized",
" organizers",
" randomized",
" strtolower"
],
"Lithuanian": [
" Administrator",
" parliamentary",
" administratie",
" surprisingly",
" testimonials",
"Administrator",
" spirituality",
" Testimonials",
" tragamonedas",
" propietarios",
" provisioning",
" proprietary",
" preliminary",
".mybatisplus",
"Testimonials",
" variability",
" movimientos",
" Respublikas",
" Australia's",
" alojamiento"
],
"Slovak": [
" predominantly",
" psychologists",
"stackoverflow",
" planejamento",
" psychiatrist",
"sprechpartner",
" photovoltaic",
" proprietário",
" provenientes",
" psychiatric",
" proveedores",
" Observatory",
" provocative",
" predominant",
" tecnológica",
" tecnológico",
" chromosomes",
" innovatieve",
"providername",
" prehistoric"
],
"Croatian": [
".Globalization",
".djangoproject",
" globalization",
" Coronavirus",
" projections",
" aprendizaje",
" informacije",
" pornography",
" pornografia",
"/javascript",
" projection",
" organizing",
" subjective",
"@Injectable",
" Injectable",
" personajes",
" javascript",
" projectile",
" optimizing",
" Innovative"
],
"Slovenian; Slovene": [
" invokevirtual",
" preparedness",
" vrijblijvend",
" obligatoire",
" Tripadvisor",
" spokeswoman",
" predstavlja",
" zdravljenje",
"culoskeletal",
" developing",
" innovative",
" Observable",
" vulnerable",
" governance",
" inevitable",
" observable",
" privileged",
" preventive",
" obtainable",
".observable"
],
"Nepali": [
" प्रधानमन्त्री",
" प्रधानमंत्री",
" प्रतिक्रिया",
" कार्यक्रममा",
" कार्यक्रम",
" स्वास्थ्य",
" सार्वजनिक",
" प्रक्रिया",
" सुनिश्चित",
" क्षेत्रमा",
" नगरपालिका",
" उम्मीदवार",
" मन्त्रालय",
" अनुसन्धान",
" नियन्त्रण",
" प्राकृतिक",
" कार्यालय",
" राजनीतिक",
" प्रतिनिध",
" सुरक्षित"
],
"Bengali; Bangla": [
" প্রধানমন্ত্রী",
" বিশ্ববিদ্যাল",
" গুরুত্বপূর্ণ",
" বৃহস্পতিবার",
" আন্তর্জাতিক",
" বাংলাদেশের",
" সেপ্টেম্বর",
" চেয়ারম্যান",
" প্রতিষ্ঠান",
" স্বাস্থ্য",
" সিদ্ধান্ত",
" হাসপাতালে",
" কর্মকর্তা",
" জানিয়েছেন",
" প্রতিনিধি",
" বিস্তারিত",
" বাংলাদেশ",
" প্রতিষ্ঠ",
" বিরুদ্ধে",
" অনুষ্ঠিত"
],
"Czech": [
"mnopqrstuvwxyz",
" polypropylene",
" technologie",
" synchronous",
" společnosti",
" JSONObject",
" objectives",
".JSONObject",
" disponível",
"otechnology",
" prototypes",
"\tJSONObject",
" prosperous",
"(JSONObject",
" problemlos",
".prototype",
" obviously",
"JSONObject",
" prototype",
" souhaitez"
],
"Hindi": [
" विश्वविद्यालय",
" महत्वपूर्ण",
" अधिकारियों",
" खिलाड़ियों",
" पाकिस्तान",
" क्षेत्रों",
" जिन्होंने",
" वैज्ञानिक",
" निर्धारित",
" सकारात्मक",
" उन्होंने",
" कांग्रेस",
" प्रदर्शन",
" आवश्यकता",
" इस्तेमाल",
" प्रभावित",
" परिवर्तन",
" प्रस्तुत",
" उद्देश्य",
" शुक्रवार"
],
"Latvian": [
" Infrastruktur",
".parametrize",
" Vietnamese",
" inspiratie",
"azzjonijiet",
" summarizes",
" bienvenida",
" galvanized",
" astronauts",
" atrocities",
" satisfied",
" varieties",
" gastronom",
" retrieval",
" satisfies",
" nepiecieš",
" Retrieval",
" previstas",
" Parkplatz",
" kunststof"
],
"Kannada": [
" ಹಿನ್ನೆಲೆಯಲ್ಲಿ",
" ತಿಳಿಸಿದ್ದಾರೆ",
" ಸಂದರ್ಭದಲ್ಲಿ",
" ಮುಖ್ಯಮಂತ್ರಿ",
" ಮಾಡಿದ್ದಾರೆ",
" ಹೇಳಿದ್ದಾರೆ",
"ುತ್ತಿದ್ದಾರೆ",
" ನೀಡಿದ್ದಾರೆ",
" ಆಸ್ಪತ್ರೆಗೆ",
" ಕಾರ್ಯಕ್ರಮ",
" ಕಾಂಗ್ರೆಸ್",
" ವಿದ್ಯಾರ್ಥ",
" ಸಾರ್ವಜನಿಕ",
" ರಾಷ್ಟ್ರೀಯ",
"ಿಸಿದ್ದಾರೆ",
" ಸೇರಿದಂತೆ",
" ಬೆಂಗಳೂರು",
"ಪ್ರಜಾವಾಣಿ",
" ಅಭಿವೃದ್ಧ",
" ಚಿಕಿತ್ಸೆ"
],
"Malayalam": [
" തിരുവനന്തപുരം",
"തിരുവനന്തപുരം",
" മുഖ്യമന്ത്രി",
"ിച്ചിട്ടുണ്ട്",
" വ്യക്തമാക്കി",
"ിരിക്കുന്നത്",
"ുകയായിരുന്നു",
" ബന്ധപ്പെട്ട",
" വിദ്യാഭ്യാസ",
" സെക്രട്ടറി",
" പ്രസിഡന്റ്",
"ിച്ചിരുന്നു",
" പഞ്ചായത്ത്",
" സംസാരിച്ചു",
" കോഴിക്കോട്",
" പങ്കെടുക്ക",
"പ്പെടുത്തിയ",
"ിക്കുന്നത്",
" അറിയിച്ചു",
"ുവനന്തപുരം"
],
"Hungarian": [
" horizontally",
" sérstaklega",
" absolutely",
" horizontal",
" Azərbaycan",
" negatively",
" Absolutely",
" événements",
" volleyball",
".horizontal",
" kilómetros",
"-horizontal",
"(horizontal",
" metallurgy",
" verzamelen",
" magazines",
" polyester",
"horizontal",
" Polyester",
" verfügbar"
],
"Ukrainian": [
" акоронавирус",
" ознакомиться",
" коронавирус",
" мемлекеттік",
" результатов",
" проститутки",
" документов",
" установить",
" обратиться",
" әлеуметтік",
" заниматься",
" внутренних",
" адміністра",
" використов",
" результата",
" результат",
" рекоменду",
" төхөөрөмж",
" документа",
" абсолютно"
],
"Telugu": [
" చేస్తున్నారు",
"ిస్తున్నారు",
" మాట్లాడుతూ",
" సంబంధించిన",
" చేస్తున్న",
" కార్యక్రమ",
" ప్రభుత్వం",
" ప్రస్తుతం",
" తెలిసిందే",
" సందర్భంగా",
" హైదరాబాద్",
" నేపథ్యంలో",
" కాంగ్రెస్",
" అధికారులు",
" ప్రభుత్వ",
" తెలిపారు",
"ుకున్నారు",
" పోలీసులు",
" ప్రత్యేక",
" దర్శకత్వ"
],
"Macedonian": [
" противопоказ",
" һөкүмитиниң",
" мамлекеттик",
" аанацҳауеит",
" предусматри",
" Тоҷикистон",
" достаточно",
" предназнач",
" инструмент",
" знакомства",
" предусмотр",
" профилакти",
" подготовки",
"диғанлиқини",
" икәнликини",
" сотрудники",
" еиҭеиҳәеит",
" қиливатқан",
" корпоратив",
" автомобили"
],
"Marathi; Marāṭhī": [
" मुख्यमंत्री",
" राष्ट्रपति",
" महाराष्ट्र",
" विद्यार्थी",
" राष्ट्रीय",
" विद्यार्थ",
" राष्ट्रिय",
" व्यक्तिगत",
" अभिनेत्री",
" प्रशिक्षण",
" त्यांच्या",
" व्यवस्था",
" प्रत्येक",
" प्रस्ताव",
" कर्मचारी",
" गिरफ्तार",
" विद्यालय",
" निर्वाचन",
" वास्तविक",
" स्वतंत्र"
],
"Arabic": [
" الإلكترونية",
" التكنولوجيا",
" الاجتماعية",
" الاقتصادية",
" الإلكتروني",
" الفلسطينية",
" الانتخابات",
" الإسرائيلي",
" الكهربائية",
" البريطانية",
" الإنجليزية",
" استراتيجية",
" الأمريكية",
" المعلومات",
" الاجتماعي",
" الإسلامية",
" الفلسطيني",
" المستخدمة",
" الاستثمار",
" المواطنين"
],
"Greek, Modern (1453-); Greek": [
" περισσότερο",
" πραγματοποι",
" χρησιμοποι",
" αποτέλεσμα",
" χαρακτηρισ",
" περισσότε",
" περίπτωση",
" βρίσκεται",
" τελευταία",
" κατάσταση",
" κυβέρνηση",
" πρόγραμμα",
" δημιουργ",
" λειτουργ",
" συγκεκρι",
" μεγαλύτε",
" διάρκεια",
" υπάρχουν",
" αποτελεί",
" συνέχεια"
],
"Gujarati": [
" વિદ્યાર્થીઓ",
" વિસ્તારમાં",
" વિદ્યાર્થી",
" કાર્યક્રમ",
" જિલ્લામાં",
" કાર્યવાહી",
" જણાવ્યું",
" હોસ્પિટલ",
" કોંગ્રેસ",
" જિલ્લાના",
" કરવામાં",
" દરમિયાન",
" આપવામાં",
" વ્યક્તિ",
" અમદાવાદ",
" સામાન્ય",
" ગુજરાતી",
" હોવાનું",
" જાહેરાત",
" કેન્દ્ર"
],
"Korean": [
" 微信公众号天天中彩票",
" 微信上的天天中彩票",
" 微信里的天天中彩票",
" 天天中彩票大神推荐",
" 彩神争霸大发快三",
" 天天中彩票中大奖",
" 天天中彩票双色球",
"大发展有限公司官网",
" 天天中彩票不中返",
" 天天中彩票为什么",
" 天天中彩票nba",
" 天天彩票与你同行",
" 天天中彩票一等奖",
" 天天中彩票中奖了",
" 手机上天天中彩票",
" 天天中彩票是不是",
" 天天中彩票怎么买",
" 彩神争霸官方下载",
",最新高清无码专区",
" 天天中彩票APP"
],
"zh-cn": [
"_日本毛片免费视频观看",
" 中国福利彩票天天",
"久久免费热在线精品",
" 微信的天天中彩票",
"无码不卡高清免费v",
" 大发快三大小单双",
"给主人留下些什么吧",
" qq的天天中彩票",
"_日本一级特黄大片",
" 大发快三开奖结果",
" 彩神争霸邀请码",
"免费视频在线观看",
"无码不卡高清免费",
"无码一区二区三区",
" 大发时时彩计划",
"】【:】【“】【",
" 大发时时彩开奖",
" 大发时时彩怎么",
" 彩神争霸电脑版",
" 大发快三是国家"
],
"Hebrew": [
" פּראָדוקטן",
" פּראָדוק",
" האחרונות",
" באַקומען",
" באמצעות",
" געווארן",
" אונדזער",
" געשריבן",
" להתמודד",
" קסנומקס",
" הראשונה"
],
"Persian; Farsi": [
" افغانستان",
" استخدامها",
" باستخدام",
" بنابراین",
" استفاده",
" استخدام",
" دانشگاه",
" درخواست",
" استخراج",
" خانواده",
" شهرستان",
" پاڪستان",
" اینترنت"
],
"Urdu": [
" پاکستانی",
" وزیراعظم",
" مسلمانوں",
" بلوچستان",
" ہندوستان",
" پاکستان",
" استعمال",
" پروگرام",
" ځواکونو"
]
}
# Poetry project file for the longest-tokens script.
[tool.poetry]
# Non-package mode: Poetry is used for dependency management only.
package-mode = false
[tool.poetry.dependencies]
python = "^3.11"
# Third-party libraries imported by the script.
tiktoken = "^0.8.0"
langdetect = "^1.0.9"
iso639 = "^0.1.4"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment