Created November 25, 2019 03:42
-
-
Save edison7500/90df0614ef579d8fd039835786f67c0f to your computer and use it in GitHub Desktop.
利用 sklearn 计算文本 tf-idf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import shutil
import tempfile
from datetime import datetime, timedelta

import jieba
import pandas as pd
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from apps.articles.models import Article, get_stop_words
# Destination of the generated IDF file, read from the Django setting
# ``IDF_PATH``; falls back to ``None`` when the setting is not defined.
idf_path = getattr(settings, "IDF_PATH", None)
def jieba_token(value):
    """Segment ``value`` with jieba and return the tokens it produces.

    Used as the ``tokenizer`` callable of the ``CountVectorizer`` built in
    ``Command.handle``.
    """
    # cut_all=False requests jieba's non-"full" segmentation mode.
    tokens = jieba.cut(value, cut_all=False)
    return tokens
class Command(BaseCommand):
    """Management command that regenerates the IDF weight file.

    Articles published within the last ``--delta`` days are tokenized with
    ``jieba_token``, IDF weights are fitted with scikit-learn, and the result
    is written to ``idf_path`` as one ``"<term> <weight>"`` line per term,
    sorted by descending weight.
    """

    help = "gen article tfidf file"
    # Class-level default kept so ``Command.docs`` still exists for any
    # external reader; ``handle`` rebinds ``self.docs`` per run instead of
    # appending to this shared list (which used to leak documents between
    # instances and repeated invocations in the same process).
    docs = list()

    def add_arguments(self, parser):
        """Register ``-d/--delta``: size of the look-back window in days."""
        parser.add_argument("-d", "--delta", type=int, help="cal", default=30)

    def handle(self, *args, **options):
        """Fit IDF weights on recent articles and write them to ``idf_path``.

        Raises:
            CommandError: if ``IDF_PATH`` is not configured, or if no article
                was published inside the requested window.
        """
        # Fail early with a readable message instead of a TypeError from
        # ``shutil.copy`` after all the fitting work has been done.
        if not idf_path:
            raise CommandError("IDF_PATH is not configured in settings")

        days = options["delta"]
        since = datetime.now() - timedelta(days=days)

        # Fresh list on the instance: never mutate the class attribute.
        self.docs = [
            row.process_content()
            for row in Article.objects(published_at__gte=since)
        ]
        if not self.docs:
            # CountVectorizer would otherwise raise an opaque ValueError.
            raise CommandError(
                "no articles published in the last %d day(s)" % days
            )

        count_vect = CountVectorizer(
            tokenizer=jieba_token, min_df=1, stop_words=get_stop_words()
        )
        tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
        x_counts = count_vect.fit_transform(self.docs)
        # Only the fitted ``idf_`` vector is needed, so ``fit`` is enough.
        tfidf_transformer.fit(x_counts)

        # ``get_feature_names`` was removed in scikit-learn 1.2; prefer the
        # replacement and fall back only on releases that predate it.
        if hasattr(count_vect, "get_feature_names_out"):
            terms = count_vect.get_feature_names_out()
        else:
            terms = count_vect.get_feature_names()

        df = pd.DataFrame(
            tfidf_transformer.idf_, index=terms, columns=["idf_weights"]
        )
        ranked = df.sort_values("idf_weights", ascending=False)

        # The context manager guarantees the temp file is closed (and removed)
        # even if writing or copying fails.
        with tempfile.NamedTemporaryFile() as fp:
            for term, weight in ranked.itertuples(name=None):
                line = "%s %.9f\n" % (term, weight)
                fp.write(line.encode("utf-8"))
            # ``shutil.copy`` re-opens the file by name, so buffered data must
            # reach the disk first or the copy is truncated/empty.
            fp.flush()
            shutil.copy(src=fp.name, dst=idf_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.