binder notebook
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d57ff27-4c22-4189-b05d-8df11ac86f67",
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install googletrans==4.0.0-rc1\n",
    "!python -m pip install requests beautifulsoup4\n",
    "!pip install deep-translator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f183580d-3e75-4d0d-b7f7-be3bcba8ef79",
   "metadata": {},
   "outputs": [],
   "source": [
    "url = 'https://mybinder.readthedocs.io/en/latest/introduction.html#what-is-mybinder-org'\n",
    "lang = 'ja'"
   ]
  },
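  {
   "cell_type": "markdown",
   "id": "e1a2b3c4-0001-4f5a-9b6c-7d8e9f0a1b2c",
   "metadata": {},
   "source": [
    "A quick sanity check of the `deep-translator` call that the main cell below relies on. This is a minimal sketch: it assumes the package installed above and a working network connection, and the exact translated string depends on Google Translate. (The main cell also resolves relative links; an alternative sketch for that step follows it.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2b3c4d5-0002-4a6b-8c7d-8e9f0a1b2c3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sketch: the same GoogleTranslator(source='auto', target=lang)\n",
    "# constructor and .translate() call that the main cell uses.\n",
    "from deep_translator import GoogleTranslator\n",
    "print(GoogleTranslator(source='auto', target=lang).translate('What is mybinder.org?'))"
   ]
  },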
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24b56ee3-a4ca-42bf-8ec8-7e6d347eb5b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "##!pip install googletrans==4.0.0-rc1\n",
    "##!pip install deep-translator\n",
    "##!python -m pip install requests beautifulsoup4\n",
    "\n",
    "import requests,re\n",
    "from bs4 import BeautifulSoup\n",
    "from urllib.parse import urlparse\n",
    "\n",
    "#from googletrans import Translator\n",
    "#translator = Translator()\n",
    "\n",
    "from deep_translator import GoogleTranslator\n",
    "#translated = GoogleTranslator(source='auto', target='de').translate(\"keep it up, you are awesome\") # output -> Weiter so, du bist großartig\n",
    "\n",
    "uAgent = {'User-Agent': \"Mozilla/5.0 (Linux; Android 9) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36\",'Referer': 'https://www.google.com/'}\n",
    "se = requests.Session()\n",
    "res = se.get(url,headers=uAgent)\n",
    "sch = urlparse(res.url).scheme\n",
    "base = urlparse(res.url).netloc\n",
    "import os\n",
    "urldirname = os.path.dirname(res.url)\n",
    "\n",
    "import pathlib\n",
    "basepath = pathlib.Path(res.url)\n",
"\n", | |
"#import os\n", | |
"#result = os.popen(\"curl -s \" + url).read()\n", | |
"#sch = urlparse(url).scheme\n", | |
"#base = urlparse(url).netloc\n", | |
"#soup = BeautifulSoup(result, \"html.parser\")\n", | |
"soup = BeautifulSoup(res.text, \"html.parser\")\n", | |
"res.close()\n", | |
"del se\n", | |
"\n", | |
"ptag_list_0 = soup.find_all('p')\n", | |
"h6tag_list_0 = soup.find_all('h6')\n", | |
"title_list_0 = soup.find_all('title')\n", | |
"print(title_list_0[0].text)\n", | |
"title = re.sub(r'\\s','_',title_list_0[0].text)\n", | |
"\n", | |
"nullch = 'π'\n", | |
"#nullch = 'π'\n", | |
"code_tag = soup.find_all('code')\n", | |
"code_counter = 0\n", | |
"code_contents = []\n", | |
"if len(code_tag) > 0:\n", | |
" for index,tag in enumerate(code_tag):\n", | |
" print(index,tag)\n", | |
" if re.match(r'\\<code(\\S|\\s).*?\\>',str(tag)):\n", | |
" code_contents.append(str(tag))\n", | |
" strip_tag = re.sub(r'\\<code(\\S|\\s)*?>',f\"{nullch}{code_counter}{nullch}\",str(tag))\n", | |
" strip_tag = re.sub(r'\\<\\/code\\>',f\"{nullch}{nullch}{code_counter}{nullch}{nullch}\",strip_tag)\n", | |
" code_tag[index].string = strip_tag\n", | |
" print(index,strip_tag)\n", | |
" code_counter += 1\n", | |
"\n", | |
"link = soup.find_all('link')\n", | |
"if len(link) > 0:\n", | |
" for index,v in enumerate(link):\n", | |
" if not v.has_attr('rel'):\n", | |
" continue\n", | |
"# print(index,v['rel'])\n", | |
" if v['rel'] == [\"stylesheet\"]:\n", | |
" #css location\n", | |
" #print(type(v))\n", | |
" if not v.has_attr('href'):\n", | |
" #if ('href') in v:\n", | |
" continue\n", | |
"# print(v['href'])\n", | |
" if (bool(re.match(r'^http',v['href']))==False):\n", | |
" print(v['href'])\n", | |
" if (bool(re.match(r'^\\/',v['href']))==True):\n", | |
"# link[index]['href'] = sch + \"://\" + base + v['href']\n", | |
" link[index]['href'] = urldirname + v['href']\n", | |
" else:\n", | |
" if re.match(r'^\\.',v['href']):\n", | |
" temp_work = pathlib.Path(str(basepath) + '/'+ v['href']).resolve()\n", | |
" link[index]['href'] = re.sub(rf\"^\\S+?(?={base})\",sch + \"://\",str(temp_work))\n", | |
" else:\n", | |
"# link[index]['href'] = sch + \"://\" + base + '/' +v['href']\n", | |
" link[index]['href'] = urldirname + '/' +v['href']\n", | |
" print(link[index]['href'])\n", | |
"\n", | |
"image = soup.find_all('img')\n", | |
"if len(image) > 0:\n", | |
" for index,im in enumerate(image):\n", | |
"# print(index,im)\n", | |
" #if im['alt'] == \"Bellingcat\" or im['alt'] == \"GIJNlogo\":\n", | |
" if not im.has_attr('src'):\n", | |
" continue\n", | |
" if (bool(re.match(r'^http',im['src']))==False):\n", | |
" print(im['src'])\n", | |
" # image[index]['src'] = 'https://www.bellingcat.com' + im['src']\n", | |
" if (bool(re.match(r'^\\/',im['src']))==True):\n", | |
"# image[index]['src'] = sch + '://' + base + im['src']\n", | |
" image[index]['src'] = urldirname + '/' + im['src']\n", | |
" else:\n", | |
" if re.match(r'^\\.',im['src']):\n", | |
" temp_work = pathlib.Path(str(basepath) + '/'+ im['src']).resolve()\n", | |
" image[index]['src'] = re.sub(rf\"^\\S+?(?={base})\",sch + \"://\",str(temp_work))\n", | |
" else:\n", | |
"# image[index]['src'] = sch + '://' + base + '/' + im['src']\n", | |
" image[index]['src'] = urldirname + '/' + im['src']\n", | |
" print(index,image[index]['src'])\n", | |
"\n", | |
"\n", | |
"import time\n", | |
"counter = 0\n", | |
"def trans(list0,translator,counter):\n", | |
"#def trans(list0,lang):\n", | |
" link_list = []\n", | |
" link_words_list = []\n", | |
"\n", | |
" for index,lines in enumerate(list0):\n", | |
" counter2 = counter\n", | |
" print()\n", | |
" print(index, lines)\n", | |
"# xxxx = lines.text.strip()\n", | |
" #xxxx1 = re.finditer(r'\\b((\\=|\\.|\\d|\\w|[ -;:,\"ββ\\'&\\?\\!\\.])*(?!([^<]*>)))',str(lines))\n", | |
"\n", | |
" #(?<=\\<p\\>)(.+)(?=\\<\\/p)\n", | |
" #(\\w|,|\\.|\\&|\\=|;|([ β-]))+(?!([^<]*>))\n", | |
"\n", | |
" soup2 = BeautifulSoup(str(lines), \"html.parser\")\n", | |
" a_link = soup2.find_all('a')\n", | |
" newtag = []\n", | |
" if len(a_link) > 0:\n", | |
" for i,v in enumerate(a_link):\n", | |
" #link_words = re.findall(r'\\b(\\w+?(?!([^<]*>)))\\b',str(v))\n", | |
" if v.has_attr('href'):\n", | |
" pass\n", | |
" else:\n", | |
" continue\n", | |
" link_href = v.get('href')\n", | |
" if (bool(re.search(r'^http',link_href))==False):\n", | |
" if (bool(re.match(r'^\\/',link_href))==True):\n", | |
" temp_work = pathlib.Path(str(basepath) + link_href).resolve()\n", | |
" link_href = re.sub(rf\"^\\S+?(?={base})\",sch + \"://\",str(temp_work))\n", | |
" #link_href = sch + '://' + base + link_href\n", | |
" else:\n", | |
" temp_work = pathlib.Path(str(basepath) + '\\/'+ link_href).resolve()\n", | |
" link_href = re.sub(rf\"^\\S+?(?={base})\",sch + \"://\",str(temp_work))\n", | |
" #link_href = sch + '://' + base + '/' + link_href\n", | |
" link_words = v.text\n", | |
" print()\n", | |
" print(\"words\",link_words)\n", | |
" print(\"a link:\",link_href)\n", | |
" link_list.append(link_href)\n", | |
" link_words_list.append(link_words)\n", | |
"\n", | |
" if len(link_words) > 0:\n", | |
" tag = soup.new_tag('a',href= link_href)\n", | |
" if link_words != '':\n", | |
" tag.string = link_words\n", | |
" elif link_words == False:\n", | |
" tag.string = str(link_href)\n", | |
" else:\n", | |
" tag.string = str(link_href)\n", | |
" newtag.append(tag)\n", | |
"\n", | |
" print(newtag)\n", | |
" xxxx0 = re.sub(r'\\<p\\>|\\<\\/p\\>','',str(lines))\n", | |
" xxxx1 = re.finditer(r'((\\.|\\d|\\w|&|\\=|[ \\/\\(\\)\\-;:,β%#+β¦|\"ββββ\\'&\\?\\!\\.])*(?!([^<]*>)))',xxxx0)\n", | |
" xxxx2 = \"\"\n", | |
" for word in xxxx1:\n", | |
" t = word[1]\n", | |
" xxxx2 += t + 'π'\n", | |
" print()\n", | |
" print(xxxx2)\n", | |
"\n", | |
"# mark_words = []\n", | |
"# mark_words2 = []\n", | |
"#\n", | |
"# link_addr = re.findall(r'(?<=href\\=\\\").+?(?=\\\")',str(lines))\n", | |
"# if len(link_addr) > 0:\n", | |
"# atag = re.findall(r'(?<=\\<a).+?(?=\\<\\/a)',str(lines))\n", | |
"# print(atag)\n", | |
"# for a_text in atag:\n", | |
"# mark_words += re.findall(r'\\b(\\w+?(?!([^<]*>)))\\b',a_text)\n", | |
"# for v in mark_words:\n", | |
"# strvv = ' '.join(v)\n", | |
"# mark_words2.append(strvv.strip())\n", | |
"# print(\"words\",mark_words2)\n", | |
"# print('link:',link_addr)\n", | |
"\n", | |
" #xxxx3 = re.sub(r\"\\s{3,}\",' ',xxxx2.strip())\n", | |
" xxxx3 = re.sub(r\"π\",'',xxxx2.strip())\n", | |
"\n", | |
" print()\n", | |
" print(xxxx3)\n", | |
"\n", | |
"# if(re.match(r'\\w|\\β',xxxx) != None ):\n", | |
" if(re.match(r'\\w|\\β',xxxx3) != None ):\n", | |
" print()\n", | |
"# print(xxxx3)\n", | |
" #pattern match\n", | |
"# texts = re.sub(r'\\.\\s+','. ',xxxx)\n", | |
"# texts = re.sub(r'\\s{2}',' \\'',texts)\n", | |
" texts = xxxx3\n", | |
" texts = re.sub(r'\\s{2,}',' \\'',texts)\n", | |
" texts = re.sub(r'\\.\\s+','. ',texts)\n", | |
" texts = re.sub(r'\\?\\s+','? ',texts)\n", | |
" texts = re.sub(r'\\!\\s+','! ',texts)\n", | |
" texts = re.sub(r'\\,\\s+',', ',texts)\n", | |
" print()\n", | |
"# print(index, xxxx)\n", | |
" print(index, texts)\n", | |
" if len(newtag) > 0:\n", | |
" for link_v in newtag:\n", | |
" print('newtag text:',link_v.text)\n", | |
" print('newtag val:',link_v)\n", | |
" counter += 1\n", | |
" try:\n", | |
" texts = re.sub(rf\"{link_v.text}\",f\"βπ‘{link_v.text}π‘β¦β§{counter}β§βΈβ\",texts)\n", | |
"# texts = re.sub(rf\"{link_v.text}\",'<span class=\"e;notranslate\"e;>' + f\"π΅β½π‘β½βΈβ¦β¦{link_v.text}π‘β¦β¦β§{counter}β§βΈ\"+'</span>',texts)\n", | |
" print('texts :',texts)\n", | |
" except:\n", | |
" print('error')\n", | |
" texts = link_v.text\n", | |
"\n", | |
" try:\n", | |
" print()\n", | |
" print('translated:')\n", | |
"# translator = GoogleTranslator(source='auto', target=lang)\n", | |
" translated = translator.translate(text=texts)\n", | |
" print(index, translated)\n", | |
"# translated = translator.translate(str(texts), dest=lang)\n", | |
"# print(index, translated.text)\n", | |
" print('______________________________')\n", | |
"# list0[index].string = translated.text\n", | |
" list0[index].string = translated\n", | |
" if len(newtag) > 0:\n", | |
" for link in newtag:\n", | |
" counter2 += 1\n", | |
" div = soup.new_tag('div')\n", | |
" div.string = 'β¦linkβ§βΈ' + str(counter2) + ':'\n", | |
" div.append(link)\n", | |
" list0[index].append(div)\n", | |
"\n", | |
" except:\n", | |
"# time.sleep(5)\n", | |
" print('translated: fail')\n", | |
"\n", | |
" return link_list,link_words_list,soup\n", | |
"\n", | |
"translator = GoogleTranslator(source='auto', target=lang)\n", | |
"links1,word1,soup = trans(h6tag_list_0,translator,counter)\n", | |
"links2,word2,soup = trans(ptag_list_0,translator,counter)\n", | |
"del translator\n", | |
"#trans(ptag_list_0,lang)\n", | |
"#trans(h6tag_list_0,lang)\n", | |
"\n", | |
"links3 = []\n", | |
"if links1 != None and links2 != None:\n", | |
" links3 = links1 + links2\n", | |
"elif links1 != None:\n", | |
" links3 = links1\n", | |
"else:\n", | |
" pass\n", | |
"\n", | |
"word3 = []\n", | |
"if word1 != None and word2 != None:\n", | |
" word3 = word1 + word2\n", | |
"elif word1 != None:\n", | |
" word3 = word1\n", | |
"else:\n", | |
" pass\n", | |
"\n", | |
"metatag = soup.new_tag('meta')\n", | |
"metatag.attrs['charset'] = \"utf-8\"\n", | |
"soup.head.append(metatag)\n", | |
"\n", | |
"#import os\n", | |
"#filename = os.path.basename(url)\n", | |
"filename = title[0:6] + '.html'\n", | |
"filename = re.sub(r'\\/','_',filename)\n", | |
"\n", | |
"with open(filename, \"wb\") as f_output:\n", | |
" f_output.write(soup.prettify(\"utf-8\"))\n", | |
"\n", | |
"# βπ΅π‘β½βΈβ¦β¦ π‘β¦β¦β§ β§βΈ\n", | |
"\n", | |
"file = open(filename, \"r\", encoding='utf-8')\n", | |
"line_list = file.readlines()\n", | |
"newtext = \"\"\n", | |
"re_pattern = re.compile(r\"(π‘\\S+?π‘β¦β§\\S+?β§βΈ)\")\n", | |
"re_pattern2 = re.compile(r\"(β¦β§\\S+?β§βΈ)\")\n", | |
"for linebyline in line_list:\n", | |
" temp_1 = []\n", | |
" temp_2 = []\n", | |
" #a_link_num = re.findall(r'π‘\\S+?π‘β¦β§(\\d+?)β§βΈ',linebyline)\n", | |
" a_link_num = re.findall(r'π‘.*?π‘β¦β§(\\S+?)β§βΈ',linebyline)\n", | |
" if len(a_link_num) > 0:\n", | |
" temp_0 = []\n", | |
" line2 = linebyline\n", | |
" for i,v in enumerate(a_link_num):\n", | |
" if not v in temp_0:\n", | |
" temp_2.append(v)\n", | |
" temp_0.append(v)\n", | |
" print('a_link_num:',i,v)\n", | |
" num = int(v)\n", | |
"\n", | |
" #extract_words = re.finditer(r\"π‘(\\S+?)π‘β¦β§\\d+?β§βΈ\",linebyline)\n", | |
" extract_words = re.finditer(r\"π‘(\\S+?)π‘β¦β§\\S+?β§βΈ\",linebyline)\n", | |
"\n", | |
" if extract_words != None:\n", | |
" if num < len(links3):\n", | |
" for iew,w in enumerate(extract_words):\n", | |
" ws = str(w.group()) #link_words ...translated word\n", | |
" if not ws in temp_1:\n", | |
" temp_1.append(ws)\n", | |
" print(ws)\n", | |
" matc = re.findall(re_pattern,line2)\n", | |
" if len(matc) > 0:\n", | |
" for ms in matc:\n", | |
" if (ms.find(ws)) != -1:\n", | |
"\n", | |
" link_number = re.match(r'π‘\\S+?π‘β¦β§(?P<number>\\S+?)β§βΈ',ws)\n", | |
" #print('link_number:',link_number.groups()[0])\n", | |
" # linl_number.groups()[0] == link_number.group('number')\n", | |
" print('link_number:',link_number.group('number'))\n", | |
" number = int(link_number.groups()[0])\n", | |
" embed_link = str(links3[number - 1])\n", | |
" word = str(word3[number-1])\n", | |
" print('non skipped')\n", | |
" striped_ws = re.sub(r'π|βΈ|β¦|π‘|','',ws)\n", | |
" print(striped_ws)\n", | |
" if (bool(re.search(rf\"{ws}\",line2))==True):\n", | |
" print(line2)\n", | |
" line2 = line2.replace(ws,f\"<a href={embed_link}>{striped_ws}</a>\",1)\n", | |
" print(line2)\n", | |
" #line2 = re.sub(r'π|βΈ|β¦|π‘|','',line2)\n", | |
" break\n", | |
"\n", | |
" else:\n", | |
" print('skipped!!!')\n", | |
"\n", | |
" newtext += line2\n", | |
" else:\n", | |
" newtext += linebyline\n", | |
"\n", | |
"\n", | |
" #a_link_num2 = re.findall(r'β¦β§(\\d+?)β§βΈ',line2)\n", | |
" a_link_num2 = re.findall(r'β¦β§(\\S+?)β§βΈ',newtext)\n", | |
" if len(a_link_num2) > 0:\n", | |
" temp_0 = []\n", | |
" for i,v in enumerate(a_link_num2):\n", | |
" print('a_link_num2:',i,v)\n", | |
" if not v in temp_2:\n", | |
" print(temp_2)\n", | |
" if not v in temp_0:\n", | |
" temp_0.append(v)\n", | |
" print('a_link_num2:',i,v)\n", | |
" num = int(v)\n", | |
" extract_words2 = v\n", | |
" if extract_words2 != None:\n", | |
" if num < len(links3):\n", | |
" if not extract_words2 in temp_1:\n", | |
" temp_1.append(extract_words2)\n", | |
" print(extract_words2)\n", | |
" matc = re.findall(re_pattern2,newtext)\n", | |
" if len(matc) > 0:\n", | |
" for ms in matc:\n", | |
" if (ms.find(extract_words2)) != -1:\n", | |
"\n", | |
" link_number = num\n", | |
" print('link_number:',num)\n", | |
" embed_link = str(links3[num - 1])\n", | |
" word = str(word3[num - 1])\n", | |
" print('non skipped')\n", | |
" newtext= newtext.replace('β¦β§'+ extract_words2 + 'β§βΈ',f\"<a href={embed_link}>β¦β§{extract_words2}β§βΈ</a>\")\n", | |
" newtext = re.sub(r'π|βΈ|β¦|π‘|','',newtext)\n", | |
"\n", | |
" else:\n", | |
" print('skipped!!!')\n", | |
" \n", | |
" codetag = re.findall(rf\"{nullch}\\d+?{nullch}.+?{nullch}{nullch}\\d+?{nullch}{nullch}\",newtext)\n", | |
" if len(codetag) > 0:\n", | |
" print('code found!')\n", | |
" for cv in codetag:\n", | |
" counter_num = re.match(rf\"{nullch}(\\d+?){nullch}\",str(cv))\n", | |
" print(counter_num)\n", | |
" match1 = counter_num.group(0)\n", | |
" i = re.sub(rf\"{nullch}\",'',match1)\n", | |
" print(\"i:\",i)\n", | |
" contents = code_contents[int(i)]\n", | |
" print('code:',contents)\n", | |
" if len(re.findall(rf\"{match1}\",cv)) != 2:\n", | |
" #text = re.sub(rf\"{match1}\",contents,str(cv))\n", | |
" #newtext = re.sub(rf\"{nullch}\\d+?{nullch}\",text,newtext,1)\n", | |
" continue\n", | |
" print(cv)\n", | |
" text = re.sub(rf\"^{nullch}\\d+?{nullch}.+?{nullch}{nullch}\\d+?{nullch}{nullch}\",contents,str(cv))\n", | |
" #text = re.sub(r'^π\\d+?π','<code>',str(cv))\n", | |
" #text = re.sub(r'ππ\\d+?ππ','</code>',str(text))\n", | |
" newtext = re.sub(rf\"{nullch}\\d+?{nullch}.+?{nullch}{nullch}\\d+?{nullch}{nullch}\",text,newtext,1)\n", | |
" #newtext = re.sub(r'π\\d+?π.+?ππ\\d+?ππ',str(text),newtext,1)\n", | |
" newtext = re.sub(rf'({nullch}{nullch}\\d+?{nullch}{nullch})','',newtext)\n", | |
" newtext = re.sub(rf'({nullch}\\d+?{nullch})','',newtext)\n", | |
" newtext = re.sub(rf'({nullch}\\d+)','',newtext)\n", | |
" newtext = re.sub(rf'({nullch})','',newtext)\n", | |
"re.purge()\n", | |
"file.close()\n", | |
"\n", | |
"with open('generated.html', \"w+\", encoding='utf-8') as file:\n", | |
" file.write(newtext)\n", | |
"# π΅π‘β½βΈβ¦β¦ π‘β¦β¦β§ β§βΈ" | |
] | |
} | |
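  {
   "cell_type": "markdown",
   "id": "b7f3c2d1-0003-4c8e-9f21-5e6a7b8c9d0e",
   "metadata": {},
   "source": [
    "The cell above resolves relative `href`/`src` values by treating the page URL as a `pathlib.Path` and patching the scheme back with a regex. For comparison, here is a minimal sketch of the standard-library alternative, `urllib.parse.urljoin`, which resolves plain, dot-relative and root-relative references in one call (not what the cell above uses):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8a4d3e2-0004-4d9f-8a32-6f7b8c9d0e1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: urljoin resolves each kind of relative reference against the page URL.\n",
    "from urllib.parse import urljoin\n",
    "\n",
    "page = 'https://mybinder.readthedocs.io/en/latest/introduction.html'\n",
    "for href in ('style.css', './deploy.css', '../img/logo.png', '/favicon.ico'):\n",
    "    print(href, '->', urljoin(page, href))"
   ]
  }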
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}