# Livedoor news title classification: download the corpus, build a
# two-class CSV of article titles (it-life-hack vs. dokujo-tsushin),
# then train a linear SVM on sister sentence embeddings (second script).

# --- Script 1: download the livedoor news corpus and build the title CSV ---
from pathlib import Path
import tarfile

import pandas as pd
import wget

URL = "https://www.rondhuit.com/download/ldcc-20140209.tar.gz"
SAVETO = Path("./livedoor-news-data.tar.gz")
DATASET_PATH = Path("dataset")


def main():
    # Download and extract the corpus only once.
    if not DATASET_PATH.exists():
        filepath = wget.download(URL, out=str(SAVETO))
        with tarfile.open(filepath) as fin:
            fin.extractall(DATASET_PATH)

    # Each article file stores its title on the third line (index 2).
    it_titles = []
    dokujo_titles = []
    for fpath in (DATASET_PATH / "text" / "it-life-hack").glob("it-*.txt"):
        it_titles.append(fpath.read_text().split("\n")[2])
    for fpath in (DATASET_PATH / "text" / "dokujo-tsushin").glob("dokujo-*.txt"):
        dokujo_titles.append(fpath.read_text().split("\n")[2])

    # Label 0 = it-life-hack, label 1 = dokujo-tsushin.
    pd.DataFrame({
        "text": it_titles + dokujo_titles,
        "label": [0] * len(it_titles) + [1] * len(dokujo_titles),
    }).to_csv("./dataset.csv", index=False)


if __name__ == "__main__":
    main()
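
Not part of the original gist: a quick sanity check of the CSV the first script writes. This is a minimal sketch assuming dataset.csv sits in the working directory; it only confirms the shape, the per-class counts, and a few sample rows.

import pandas as pd

df = pd.read_csv("./dataset.csv")
print(df.shape)                      # (number of titles, 2 columns: text, label)
print(df["label"].value_counts())    # rows per class: 0 = it-life-hack, 1 = dokujo-tsushin
print(df.sample(3, random_state=0))  # a few random title/label pairs
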
# --- Script 2: embed the titles with sister and train a linear SVM ---
import numpy as np
import pandas as pd
import sister
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


def main():
    dataset = pd.read_csv("./dataset.csv")
    texts, labels = zip(*dataset[["text", "label"]].values.tolist())

    # Stratify so both classes keep the same proportions in train and test.
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, stratify=labels)

    # sister's MeanEmbedding averages fastText word vectors into a single
    # fixed-size sentence vector (Japanese model).
    sentence_embedding = sister.MeanEmbedding(lang="ja")
    train_x = np.array([sentence_embedding(t) for t in train_texts])
    test_x = np.array([sentence_embedding(t) for t in test_texts])

    # Linear SVM on the sentence vectors; print test accuracy.
    clf = SVC(kernel="linear")
    clf.fit(train_x, train_labels)
    print(clf.score(test_x, test_labels))


if __name__ == "__main__":
    main()
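
Also not part of the original gist: a single train/test split can give a noisy accuracy number, so one optional variation is k-fold cross-validation over the same embeddings. The sketch below assumes the same dataset.csv and sister setup as above, and re-embeds every title, which is the slow step.

import numpy as np
import pandas as pd
import sister
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

dataset = pd.read_csv("./dataset.csv")
embed = sister.MeanEmbedding(lang="ja")
x = np.array([embed(t) for t in dataset["text"]])
y = dataset["label"].values

# 5-fold cross-validation gives a less split-dependent estimate than one
# train/test split; scores holds five per-fold accuracies.
scores = cross_val_score(SVC(kernel="linear"), x, y, cv=5)
print(scores.mean(), scores.std())
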