...
rancher_kubernetes_engine_config:
  ...
  addons: |-
    ---
    apiVersion: v1
    kind: Secret
    metadata:
      ...
    stringData:
      token: <YOUR-HETZNER-API-TOKEN>
    ---
    apiVersion: v1
    kind: Service
    metadata:
      annotations:
        load-balancer.hetzner.cloud/health-check-port: "<YOUR-INGRESS-HEALTH-PORT, e.g. 31902>"
        load-balancer.hetzner.cloud/name: "<YOUR-LB-NAME>"
    spec:
      clusterIP: <Internal-IP>
      externalTrafficPolicy: Local
      healthCheckNodePort: 30787
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from tqdm import tqdm

# `docs` is the list of tokenized documents built in the preprocessing snippets below.
words = corpora.Dictionary(docs)
corpus = [words.doc2bow(doc) for doc in docs]

# Sweep candidate topic counts, recording perplexity and coherence for model selection.
perplexities = []
coherence = []
num_topics = [3, 4, 10, 20] + list(range(5, 75, 10))
for nt in tqdm(num_topics):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=words,
                                                num_topics=nt,
                                                random_state=2,
                                                update_every=1,
                                                passes=10)
    # Minimal completion: the source cuts off inside the loop; these appends
    # fill the two metric lists declared above.
    perplexities.append(lda_model.log_perplexity(corpus))
    coherence.append(CoherenceModel(model=lda_model, texts=docs, dictionary=words,
                                    coherence='c_v').get_coherence())
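Once the sweep finishes, a short sketch (not in the original gist) can pick the topic count whose model scored the highest c_v coherence:

best_nt = max(zip(coherence, num_topics))[1]  # highest coherence wins
print(f"best number of topics by coherence: {best_nt}")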
import re

docs = []
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002500-\U00002BEF"  # box drawing & misc symbols
                           u"\U00002702-\U000027B0"  # dingbats
                           u"\U000024C2-\U0001F251"  # enclosed characters
                           "]+", flags=re.UNICODE)
import spacy

nlp = spacy.load('en_core_web_sm')  # assumed model; the gist does not show which one is loaded

# Extend spaCy's default stop word list with project-specific noise tokens.
CUSTOM_STOP_WORDS = {'commit', 'github', 'pdf', 'download', 'desktop', '$', '|', '\\', '/', '#'}
nlp.Defaults.stop_words |= CUSTOM_STOP_WORDS

def lemmatizer(doc):
    """
    This takes in a doc of tokens from the NER and lemmatizes them.
    Pronouns (like "I" and "you") get lemmatized to '-PRON-', so I'm removing those.
    """
    doc = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)

def remove_stopwords(doc):
    # Minimal completion (body truncated in the source): drop stop-word tokens, mirroring lemmatizer.
    doc = [token.text for token in doc if not token.is_stop]
    return nlp.make_doc(u' '.join(doc))

# spaCy v2-style pipeline: custom components are registered as plain functions.
nlp.add_pipe(lemmatizer, name='lemmatizer', after='ner')
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)
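A quick check of the assembled pipeline (the sentence is illustrative):

doc = nlp(u"The cats were running around the repository")
print([token.text for token in doc])  # lemmatized tokens with stop words removed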
from bs4 import BeautifulSoup

def html2ScrapedWeb(url: str, html: str) -> ScrapedWebVitamined:
    """
    Parse HTML with BS4's html5lib parser and get the <body> content without
    <nav>, <script>, <footer>.
    It focuses on the content.
    """
    dom = BeautifulSoup(html, 'html5lib')
    # 1. Get title
    title = dom.title.string if dom.title else None
    # 2. Get description (minimal completion; the source truncates here)
    description_tag = dom.find('meta', attrs={'name': 'description'})
    description = description_tag.get('content') if description_tag else None
from typing import List

class ScrapedWeb(object):
    """
    Scraped web page (POJO-style value object).
    """
    def __init__(self, url: str, title: str, description: str, headings: List[str], contents: List[str], dom: BeautifulSoup):
        self.url = url
        self.title = title
        self.description = description
        self.headings = headings
        self.contents = contents  # completion: remaining fields implied by the signature
        self.dom = dom
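An end-to-end usage sketch tying the scraper to the topic-modeling pipeline above (the URL is a placeholder, requests is my choice of HTTP client, and it presumes the truncated html2ScrapedWeb returns the populated object):

import requests

resp = requests.get("https://example.com")  # placeholder URL
page = html2ScrapedWeb("https://example.com", resp.text)
# Feed scraped text through the emoji filter and spaCy pipeline into the LDA corpus.
for text in page.contents:
    cleaned = emoji_pattern.sub("", text)
    docs.append([token.text for token in nlp(cleaned)])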