Skip to content

Instantly share code, notes, and snippets.

@NTT123
Created February 22, 2019 10:37
Show Gist options
  • Save NTT123/e8cabe83430b76c8ce7c7bfc6c8ee474 to your computer and use it in GitHub Desktop.
Save NTT123/e8cabe83430b76c8ce7c7bfc6c8ee474 to your computer and use it in GitHub Desktop.
viet_news_text.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "viet_news_text.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/NTT123/e8cabe83430b76c8ce7c7bfc6c8ee474/viet_news_text.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"metadata": {
"id": "-tlXN-3decvE",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"### Vietnamese News Dataset\n",
"\n",
"17GB of text (5GB compressed) is at https://drive.google.com/open?id=1pJmE1N1LFzBZmZKdY2pdq7jaFn52s-7M"
]
},
{
"metadata": {
"id": "h4PRYs-orUid",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"## Stop early by pressing Ctrl + C\n",
"\n",
"import tqdm\n",
"from multiprocessing import Pool\n",
"\n",
"num = 50\n",
"start = 0\n",
"end = 30000000\n",
"delta = (end - start) // num\n",
"\n",
"import unicodedata\n",
"\n",
"\n",
"def download(start):\n",
" import urllib3\n",
" urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)\n",
" import requests\n",
"\n",
" f = open(\"{:09d}.txt\".format(start), \"w\")\n",
" ite = range(start, start + delta)\n",
" s = requests.Session()\n",
"\n",
" if start == (num//2)*delta:\n",
" ite = tqdm.tqdm(ite)\n",
"\n",
" for c in ite:\n",
" url = \"https://baomoi.com/c/{}.epi\".format(c)\n",
" try:\n",
" r = s.get(url, allow_redirects=False, timeout=100, verify=False)\n",
"\n",
" if r.status_code == 200:\n",
" idx1 = r.text.find('<div class=\"article\">')\n",
" idx2 = r.text.find('<div class=\"timeline\">')\n",
"\n",
" if idx1 != -1 and idx2 != -1:\n",
" f.write( unicodedata.normalize('NFC', r.text[idx1:idx2]) )\n",
" except:\n",
" pass\n",
"\n",
" return start, start+delta\n",
"\n",
"with Pool(num) as p:\n",
" print(p.map(download, range(start, end, delta)) )\n"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "xkyFn9z1riJ5",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"%%writefile cleaner.pl\n",
"\n",
"while (<>) {\n",
" s/{([^{}]|(?0))*}//g;\n",
" s/\\[([^\\[\\]]|(?0))*]//g;\n",
"\n",
" s/\\(([^()]|(?0))*\\)//g;\n",
"\n",
" s/<[^>]+>/ /g;\n",
" s/<\\/[^>]+>/ /g;\n",
" s/\"+/\"/g;\n",
" s/<[^>]+>[^<]*<\\/[^>]+>/ /g;\n",
" s/—/-/g;\n",
" s/[^\\|]+\\|([^\\|]+)/$1/g;\n",
" s/[^\\|]+\\|([^\\|]+)/$1/g;\n",
" s/[^\\|]+\\|([^\\|]+)/$1/g;\n",
" s/[^\\|]+\\|([^\\|]+)/$1/g;\n",
" s/^[ ]+//g;\n",
" s/[ ]+,/,/g;\n",
" s/[ ]+[.]/./g;\n",
" s/[ ]+ / /g;\n",
" s/…/.../g;\n",
" s/“/\"/g;\n",
" s/”/\"/g;\n",
"\n",
"\n",
" s/[\\-–\\xad−–─‒]/-/g;\n",
" s/[·]/./g;\n",
" s/[‟„«»]/\"/g;\n",
" s/[ ̊]/°/g;\n",
" s/[\\x80]/€/g;\n",
" s/&gt;//g;\n",
" s/&lt;//g;\n",
" s/[’‘]/'/g;\n",
" s/([*=;,&_$%?\\/!&@:£*])/$1/g;\n",
" s/[\\x13 \\(\\)\\+®># ±² н ≤ ¬ ÷ ▪ \\| ¦ ≥ \\{ \\} \\x{f098} \\x{200f} × ♀ ♂ ๑ ̉ ̀ ́ ̃ ̈ ̣ ̧ ^ © • з ● \\x99 \\x{f04a} → ♫ \\x92 \\~ \\x{200e} ‰ ™ ε ° ¸ ` \\[ \\] ½ º ³ ♥ � ¼ \\x10]/ /g; \n",
" s/[ ]/ /g;\n",
" s/[^! \" $ % & ' * , \\- . 0 1 2 3 4 5 6 7 8 \\/ 9 : ; < = ? @ A B C D E F G H I \\n J K L M N O P Q R S T U V W X Y Z \\\\ _ a b c d e f g h i j k l m n o p q r s t u v w x y z ¢ £ ¥ § µ À Á Â Ã Ä Å Ç È É Ê Ë Ì Í Î Ï Ð Ò Ó Ô Õ Ö Ø Ù Ú Ü Ý Þ ß à á â ã ä å æ ç è é ê ë ì í î ï ñ ò ó ô õ ö ø ù ú û ü ý ÿ Ă ă ć Č č Đ đ ē ę ě ğ Ĩ ĩ ł ń ň ō ő œ ŕ Ř ř ŝ Ş ş Š š ţ ť Ũ ũ Ż Ž ž Ơ ơ Ư ư ǎ ǵ ǹ ɔ ə ʃ Δ Σ Φ Ω α β η μ π φ І А Б В Г Д Е Ж З И К Л М Н О П Р С Т У Х Ч Ш Э Я а б в г д е ж и й к л м о п р с т у ф х ц ч ш щ ы ь э ю я ё ஐ ღ Ḅ ḍ ḥ ṇ ṍ Ṭ Ạ ạ Ả ả Ấ ấ Ầ ầ Ẩ ẩ Ẫ ẫ Ậ ậ Ắ ắ Ằ ằ Ẳ ẳ Ẵ ẵ Ặ ặ Ẹ ẹ Ẻ ẻ Ẽ ẽ Ế ế Ề ề Ể ể Ễ ễ Ệ ệ Ỉ ỉ Ị ị Ọ ọ Ỏ ỏ Ố ố Ồ ồ Ổ ổ Ỗ ỗ Ộ ộ Ớ ớ Ờ ờ Ở ở Ỡ ỡ Ợ ợ Ụ ụ Ủ ủ Ứ ứ Ừ ừ Ử ử Ữ ữ Ự ự Ỳ ỳ Ỵ ỵ Ỷ ỷ Ỹ ỹ ]/ /g;\n",
"\n",
" s/[ \\t]+/ /g;\n",
"\n",
" print $_;\n",
"}\n"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "rQfpWRbMr6yq",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"%%writefile extract_text.py\n",
"\n",
"import glob\n",
"\n",
"for fn in glob.glob(\"./*.txt\"):\n",
" with open(fn, \"r\") as f:\n",
" flag = False\n",
" for line in f:\n",
" if line.find('<h1 class=\"article__header\">') != -1:\n",
" print(\"====\")\n",
" print(line[36:-6])\n",
" print()\n",
" continue\n",
"\n",
" if line.find('</div>') != -1:\n",
" flag = False\n",
" continue\n",
"\n",
" if line.find('<div class=\"article__sapo\">') != -1:\n",
" flag = True\n",
" continue\n",
"\n",
" if line.find('<div class=\"article__body\">') != -1:\n",
" flag = True\n",
" continue\n",
" \n",
" if flag:\n",
" if line.find(\"body-text\") == -1:\n",
" print(line, end=\"\")\n",
" continue\n",
"\n",
" if line.find(' <p class=\"body-image\">') == 0:\n",
" continue\n",
"\n",
"\n",
" for t in line.split('<p class=\"body-text\">'):\n",
" print(t.split('</p>')[0])\n",
" print()\n",
"\n"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "96b-kKMvr7tp",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"!python extract_text.py | perl -CSAD -Mutf8 cleaner.pl > baomoi.data"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "lbCow6hxr-nR",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"!head baomoi.data"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "9EU2D7GLr_9-",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"!ls -liah baomoi.data"
],
"execution_count": 0,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment