@dauuricus
Last active June 30, 2022 05:31
binder notebook
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "9d57ff27-4c22-4189-b05d-8df11ac86f67",
"metadata": {},
"outputs": [],
"source": [
"#!pip install googletrans==4.0.0-rc1\n",
"!python -m pip install requests beautifulsoup4\n",
"!pip install deep-translator"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f183580d-3e75-4d0d-b7f7-be3bcba8ef79",
"metadata": {},
"outputs": [],
"source": [
"url ='https://mybinder.readthedocs.io/en/latest/introduction.html#what-is-mybinder-org'\n",
"lang = 'ja'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24b56ee3-a4ca-42bf-8ec8-7e6d347eb5b1",
"metadata": {},
"outputs": [],
"source": [
"##!pip install googletrans==4.0.0-rc1\n",
"##!pip install deep-translator\n",
"##!python -m pip install requests beautifulsoup4\n",
"\n",
"import requests,re\n",
"from bs4 import BeautifulSoup\n",
"from urllib.parse import urlparse\n",
"\n",
"#from googletrans import Translator\n",
"#translator = Translator()\n",
"\n",
"from deep_translator import GoogleTranslator\n",
"#translated = GoogleTranslator(source='auto', target='de').translate(\"keep it up, you are awesome\") # output -> Weiter so, du bist großartig\n",
"\n",
"uAgent = {'User-Agent': \"Mozilla/5.0 (Linux; Android 9) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36\",'Referer': 'https://www.google.com/'}\n",
"se = requests.Session()\n",
"res = se.get(url,headers=uAgent)\n",
"sch = urlparse(res.url).scheme\n",
"base = urlparse(res.url).netloc\n",
"import os\n",
"urldirname = os.path.dirname(res.url)\n",
"\n",
"import pathlib\n",
"basepath = pathlib.Path(res.url)\n",
"\n",
"#import os\n",
"#result = os.popen(\"curl -s \" + url).read()\n",
"#sch = urlparse(url).scheme\n",
"#base = urlparse(url).netloc\n",
"#soup = BeautifulSoup(result, \"html.parser\")\n",
"soup = BeautifulSoup(res.text, \"html.parser\")\n",
"res.close()\n",
"del se\n",
"\n",
"ptag_list_0 = soup.find_all('p')\n",
"h6tag_list_0 = soup.find_all('h6')\n",
"title_list_0 = soup.find_all('title')\n",
"print(title_list_0[0].text)\n",
"title = re.sub(r'\\s','_',title_list_0[0].text)\n",
"\n",
"nullch = '𓄃'\n",
"#nullch = '𓂃'\n",
"code_tag = soup.find_all('code')\n",
"code_counter = 0\n",
"code_contents = []\n",
"if len(code_tag) > 0:\n",
" for index,tag in enumerate(code_tag):\n",
" print(index,tag)\n",
" if re.match(r'\\<code(\\S|\\s).*?\\>',str(tag)):\n",
" code_contents.append(str(tag))\n",
" strip_tag = re.sub(r'\\<code(\\S|\\s)*?>',f\"{nullch}{code_counter}{nullch}\",str(tag))\n",
" strip_tag = re.sub(r'\\<\\/code\\>',f\"{nullch}{nullch}{code_counter}{nullch}{nullch}\",strip_tag)\n",
" code_tag[index].string = strip_tag\n",
" print(index,strip_tag)\n",
" code_counter += 1\n",
"\n",
"link = soup.find_all('link')\n",
"if len(link) > 0:\n",
" for index,v in enumerate(link):\n",
" if not v.has_attr('rel'):\n",
" continue\n",
"# print(index,v['rel'])\n",
" if v['rel'] == [\"stylesheet\"]:\n",
" #css location\n",
" #print(type(v))\n",
" if not v.has_attr('href'):\n",
" #if ('href') in v:\n",
" continue\n",
"# print(v['href'])\n",
" if (bool(re.match(r'^http',v['href']))==False):\n",
" print(v['href'])\n",
" if (bool(re.match(r'^\\/',v['href']))==True):\n",
"# link[index]['href'] = sch + \"://\" + base + v['href']\n",
" link[index]['href'] = urldirname + v['href']\n",
" else:\n",
" if re.match(r'^\\.',v['href']):\n",
" temp_work = pathlib.Path(str(basepath) + '/'+ v['href']).resolve()\n",
" link[index]['href'] = re.sub(rf\"^\\S+?(?={base})\",sch + \"://\",str(temp_work))\n",
" else:\n",
"# link[index]['href'] = sch + \"://\" + base + '/' +v['href']\n",
" link[index]['href'] = urldirname + '/' +v['href']\n",
" print(link[index]['href'])\n",
"\n",
"image = soup.find_all('img')\n",
"if len(image) > 0:\n",
" for index,im in enumerate(image):\n",
"# print(index,im)\n",
" #if im['alt'] == \"Bellingcat\" or im['alt'] == \"GIJNlogo\":\n",
" if not im.has_attr('src'):\n",
" continue\n",
" if (bool(re.match(r'^http',im['src']))==False):\n",
" print(im['src'])\n",
" # image[index]['src'] = 'https://www.bellingcat.com' + im['src']\n",
" if (bool(re.match(r'^\\/',im['src']))==True):\n",
"# image[index]['src'] = sch + '://' + base + im['src']\n",
" image[index]['src'] = urldirname + '/' + im['src']\n",
" else:\n",
" if re.match(r'^\\.',im['src']):\n",
" temp_work = pathlib.Path(str(basepath) + '/'+ im['src']).resolve()\n",
" image[index]['src'] = re.sub(rf\"^\\S+?(?={base})\",sch + \"://\",str(temp_work))\n",
" else:\n",
"# image[index]['src'] = sch + '://' + base + '/' + im['src']\n",
" image[index]['src'] = urldirname + '/' + im['src']\n",
" print(index,image[index]['src'])\n",
"\n",
"\n",
"import time\n",
"counter = 0\n",
"def trans(list0,translator,counter):\n",
"#def trans(list0,lang):\n",
" link_list = []\n",
" link_words_list = []\n",
"\n",
" for index,lines in enumerate(list0):\n",
" counter2 = counter\n",
" print()\n",
" print(index, lines)\n",
"# xxxx = lines.text.strip()\n",
" #xxxx1 = re.finditer(r'\\b((\\=|\\.|\\d|\\w|[ -;:,\"β€œβ€™\\'&\\?\\!\\.])*(?!([^<]*>)))',str(lines))\n",
"\n",
" #(?<=\\<p\\>)(.+)(?=\\<\\/p)\n",
" #(\\w|,|\\.|\\&|\\=|;|([ β€”-]))+(?!([^<]*>))\n",
"\n",
" soup2 = BeautifulSoup(str(lines), \"html.parser\")\n",
" a_link = soup2.find_all('a')\n",
" newtag = []\n",
" if len(a_link) > 0:\n",
" for i,v in enumerate(a_link):\n",
" #link_words = re.findall(r'\\b(\\w+?(?!([^<]*>)))\\b',str(v))\n",
" if v.has_attr('href'):\n",
" pass\n",
" else:\n",
" continue\n",
" link_href = v.get('href')\n",
" if (bool(re.search(r'^http',link_href))==False):\n",
" if (bool(re.match(r'^\\/',link_href))==True):\n",
" temp_work = pathlib.Path(str(basepath) + link_href).resolve()\n",
" link_href = re.sub(rf\"^\\S+?(?={base})\",sch + \"://\",str(temp_work))\n",
" #link_href = sch + '://' + base + link_href\n",
" else:\n",
" temp_work = pathlib.Path(str(basepath) + '\\/'+ link_href).resolve()\n",
" link_href = re.sub(rf\"^\\S+?(?={base})\",sch + \"://\",str(temp_work))\n",
" #link_href = sch + '://' + base + '/' + link_href\n",
" link_words = v.text\n",
" print()\n",
" print(\"words\",link_words)\n",
" print(\"a link:\",link_href)\n",
" link_list.append(link_href)\n",
" link_words_list.append(link_words)\n",
"\n",
" if len(link_words) > 0:\n",
" tag = soup.new_tag('a',href= link_href)\n",
" if link_words != '':\n",
" tag.string = link_words\n",
" elif link_words == False:\n",
" tag.string = str(link_href)\n",
" else:\n",
" tag.string = str(link_href)\n",
" newtag.append(tag)\n",
"\n",
" print(newtag)\n",
" xxxx0 = re.sub(r'\\<p\\>|\\<\\/p\\>','',str(lines))\n",
" xxxx1 = re.finditer(r'((\\.|\\d|\\w|&|\\=|[ \\/\\(\\)\\-;:,β”œ%#+…|\"β€œβ€™β€˜β€\\'&\\?\\!\\.])*(?!([^<]*>)))',xxxx0)\n",
" xxxx2 = \"\"\n",
" for word in xxxx1:\n",
" t = word[1]\n",
" xxxx2 += t + 'π“‚€'\n",
" print()\n",
" print(xxxx2)\n",
"\n",
"# mark_words = []\n",
"# mark_words2 = []\n",
"#\n",
"# link_addr = re.findall(r'(?<=href\\=\\\").+?(?=\\\")',str(lines))\n",
"# if len(link_addr) > 0:\n",
"# atag = re.findall(r'(?<=\\<a).+?(?=\\<\\/a)',str(lines))\n",
"# print(atag)\n",
"# for a_text in atag:\n",
"# mark_words += re.findall(r'\\b(\\w+?(?!([^<]*>)))\\b',a_text)\n",
"# for v in mark_words:\n",
"# strvv = ' '.join(v)\n",
"# mark_words2.append(strvv.strip())\n",
"# print(\"words\",mark_words2)\n",
"# print('link:',link_addr)\n",
"\n",
" #xxxx3 = re.sub(r\"\\s{3,}\",' ',xxxx2.strip())\n",
" xxxx3 = re.sub(r\"π“‚€\",'',xxxx2.strip())\n",
"\n",
" print()\n",
" print(xxxx3)\n",
"\n",
"# if(re.match(r'\\w|\\β€œ',xxxx) != None ):\n",
" if(re.match(r'\\w|\\β€œ',xxxx3) != None ):\n",
" print()\n",
"# print(xxxx3)\n",
" #pattern match\n",
"# texts = re.sub(r'\\.\\s+','. ',xxxx)\n",
"# texts = re.sub(r'\\s{2}',' \\'',texts)\n",
" texts = xxxx3\n",
" texts = re.sub(r'\\s{2,}',' \\'',texts)\n",
" texts = re.sub(r'\\.\\s+','. ',texts)\n",
" texts = re.sub(r'\\?\\s+','? ',texts)\n",
" texts = re.sub(r'\\!\\s+','! ',texts)\n",
" texts = re.sub(r'\\,\\s+',', ',texts)\n",
" print()\n",
"# print(index, xxxx)\n",
" print(index, texts)\n",
" if len(newtag) > 0:\n",
" for link_v in newtag:\n",
" print('newtag text:',link_v.text)\n",
" print('newtag val:',link_v)\n",
" counter += 1\n",
" try:\n",
" texts = re.sub(rf\"{link_v.text}\",f\"β€Œπ“ƒ‘{link_v.text}π“ƒ‘βœ¦βœ§{counter}βœ§βœΈβ€Œ\",texts)\n",
"# texts = re.sub(rf\"{link_v.text}\",'<span class=\"e;notranslate\"e;>' + f\"π“ƒ΅β˜½π“ƒ‘β˜½βœΈβœ¦βœ¦{link_v.text}π“ƒ‘βœ¦βœ¦βœ§{counter}✧✸\"+'</span>',texts)\n",
" print('texts :',texts)\n",
" except:\n",
" print('error')\n",
" texts = link_v.text\n",
"\n",
" try:\n",
" print()\n",
" print('translated:')\n",
"# translator = GoogleTranslator(source='auto', target=lang)\n",
" translated = translator.translate(text=texts)\n",
" print(index, translated)\n",
"# translated = translator.translate(str(texts), dest=lang)\n",
"# print(index, translated.text)\n",
" print('______________________________')\n",
"# list0[index].string = translated.text\n",
" list0[index].string = translated\n",
" if len(newtag) > 0:\n",
" for link in newtag:\n",
" counter2 += 1\n",
" div = soup.new_tag('div')\n",
" div.string = '✦link✧✸' + str(counter2) + ':'\n",
" div.append(link)\n",
" list0[index].append(div)\n",
"\n",
" except:\n",
"# time.sleep(5)\n",
" print('translated: fail')\n",
"\n",
" return link_list,link_words_list,soup\n",
"\n",
"translator = GoogleTranslator(source='auto', target=lang)\n",
"links1,word1,soup = trans(h6tag_list_0,translator,counter)\n",
"links2,word2,soup = trans(ptag_list_0,translator,counter)\n",
"del translator\n",
"#trans(ptag_list_0,lang)\n",
"#trans(h6tag_list_0,lang)\n",
"\n",
"links3 = []\n",
"if links1 != None and links2 != None:\n",
" links3 = links1 + links2\n",
"elif links1 != None:\n",
" links3 = links1\n",
"else:\n",
" pass\n",
"\n",
"word3 = []\n",
"if word1 != None and word2 != None:\n",
" word3 = word1 + word2\n",
"elif word1 != None:\n",
" word3 = word1\n",
"else:\n",
" pass\n",
"\n",
"metatag = soup.new_tag('meta')\n",
"metatag.attrs['charset'] = \"utf-8\"\n",
"soup.head.append(metatag)\n",
"\n",
"#import os\n",
"#filename = os.path.basename(url)\n",
"filename = title[0:6] + '.html'\n",
"filename = re.sub(r'\\/','_',filename)\n",
"\n",
"with open(filename, \"wb\") as f_output:\n",
" f_output.write(soup.prettify(\"utf-8\"))\n",
"\n",
"# β€Œπ“ƒ΅π“ƒ‘β˜½βœΈβœ¦βœ¦ π“ƒ‘βœ¦βœ¦βœ§ ✧✸\n",
"\n",
"file = open(filename, \"r\", encoding='utf-8')\n",
"line_list = file.readlines()\n",
"newtext = \"\"\n",
"re_pattern = re.compile(r\"(𓃑\\S+?π“ƒ‘βœ¦βœ§\\S+?✧✸)\")\n",
"re_pattern2 = re.compile(r\"(✦✧\\S+?✧✸)\")\n",
"for linebyline in line_list:\n",
" temp_1 = []\n",
" temp_2 = []\n",
" #a_link_num = re.findall(r'𓃑\\S+?π“ƒ‘βœ¦βœ§(\\d+?)✧✸',linebyline)\n",
" a_link_num = re.findall(r'𓃑.*?π“ƒ‘βœ¦βœ§(\\S+?)✧✸',linebyline)\n",
" if len(a_link_num) > 0:\n",
" temp_0 = []\n",
" line2 = linebyline\n",
" for i,v in enumerate(a_link_num):\n",
" if not v in temp_0:\n",
" temp_2.append(v)\n",
" temp_0.append(v)\n",
" print('a_link_num:',i,v)\n",
" num = int(v)\n",
"\n",
" #extract_words = re.finditer(r\"𓃑(\\S+?)π“ƒ‘βœ¦βœ§\\d+?✧✸\",linebyline)\n",
" extract_words = re.finditer(r\"𓃑(\\S+?)π“ƒ‘βœ¦βœ§\\S+?✧✸\",linebyline)\n",
"\n",
" if extract_words != None:\n",
" if num < len(links3):\n",
" for iew,w in enumerate(extract_words):\n",
" ws = str(w.group()) #link_words ...translated word\n",
" if not ws in temp_1:\n",
" temp_1.append(ws)\n",
" print(ws)\n",
" matc = re.findall(re_pattern,line2)\n",
" if len(matc) > 0:\n",
" for ms in matc:\n",
" if (ms.find(ws)) != -1:\n",
"\n",
" link_number = re.match(r'𓃑\\S+?π“ƒ‘βœ¦βœ§(?P<number>\\S+?)✧✸',ws)\n",
" #print('link_number:',link_number.groups()[0])\n",
" # linl_number.groups()[0] == link_number.group('number')\n",
" print('link_number:',link_number.group('number'))\n",
" number = int(link_number.groups()[0])\n",
" embed_link = str(links3[number - 1])\n",
" word = str(word3[number-1])\n",
" print('non skipped')\n",
" striped_ws = re.sub(r'π“‚€|✸|✦|𓃑|','',ws)\n",
" print(striped_ws)\n",
" if (bool(re.search(rf\"{ws}\",line2))==True):\n",
" print(line2)\n",
" line2 = line2.replace(ws,f\"<a href={embed_link}>{striped_ws}</a>\",1)\n",
" print(line2)\n",
" #line2 = re.sub(r'π“‚€|✸|✦|𓃑|','',line2)\n",
" break\n",
"\n",
" else:\n",
" print('skipped!!!')\n",
"\n",
" newtext += line2\n",
" else:\n",
" newtext += linebyline\n",
"\n",
"\n",
" #a_link_num2 = re.findall(r'✦✧(\\d+?)✧✸',line2)\n",
" a_link_num2 = re.findall(r'✦✧(\\S+?)✧✸',newtext)\n",
" if len(a_link_num2) > 0:\n",
" temp_0 = []\n",
" for i,v in enumerate(a_link_num2):\n",
" print('a_link_num2:',i,v)\n",
" if not v in temp_2:\n",
" print(temp_2)\n",
" if not v in temp_0:\n",
" temp_0.append(v)\n",
" print('a_link_num2:',i,v)\n",
" num = int(v)\n",
" extract_words2 = v\n",
" if extract_words2 != None:\n",
" if num < len(links3):\n",
" if not extract_words2 in temp_1:\n",
" temp_1.append(extract_words2)\n",
" print(extract_words2)\n",
" matc = re.findall(re_pattern2,newtext)\n",
" if len(matc) > 0:\n",
" for ms in matc:\n",
" if (ms.find(extract_words2)) != -1:\n",
"\n",
" link_number = num\n",
" print('link_number:',num)\n",
" embed_link = str(links3[num - 1])\n",
" word = str(word3[num - 1])\n",
" print('non skipped')\n",
" newtext= newtext.replace('✦✧'+ extract_words2 + '✧✸',f\"<a href={embed_link}>✦✧{extract_words2}✧✸</a>\")\n",
" newtext = re.sub(r'π“‚€|✸|✦|𓃑|','',newtext)\n",
"\n",
" else:\n",
" print('skipped!!!')\n",
" \n",
" codetag = re.findall(rf\"{nullch}\\d+?{nullch}.+?{nullch}{nullch}\\d+?{nullch}{nullch}\",newtext)\n",
" if len(codetag) > 0:\n",
" print('code found!')\n",
" for cv in codetag:\n",
" counter_num = re.match(rf\"{nullch}(\\d+?){nullch}\",str(cv))\n",
" print(counter_num)\n",
" match1 = counter_num.group(0)\n",
" i = re.sub(rf\"{nullch}\",'',match1)\n",
" print(\"i:\",i)\n",
" contents = code_contents[int(i)]\n",
" print('code:',contents)\n",
" if len(re.findall(rf\"{match1}\",cv)) != 2:\n",
" #text = re.sub(rf\"{match1}\",contents,str(cv))\n",
" #newtext = re.sub(rf\"{nullch}\\d+?{nullch}\",text,newtext,1)\n",
" continue\n",
" print(cv)\n",
" text = re.sub(rf\"^{nullch}\\d+?{nullch}.+?{nullch}{nullch}\\d+?{nullch}{nullch}\",contents,str(cv))\n",
" #text = re.sub(r'^𓄃\\d+?𓄃','<code>',str(cv))\n",
" #text = re.sub(r'𓄃𓄃\\d+?𓄃𓄃','</code>',str(text))\n",
" newtext = re.sub(rf\"{nullch}\\d+?{nullch}.+?{nullch}{nullch}\\d+?{nullch}{nullch}\",text,newtext,1)\n",
" #newtext = re.sub(r'𓄃\\d+?𓄃.+?𓄃𓄃\\d+?𓄃𓄃',str(text),newtext,1)\n",
" newtext = re.sub(rf'({nullch}{nullch}\\d+?{nullch}{nullch})','',newtext)\n",
" newtext = re.sub(rf'({nullch}\\d+?{nullch})','',newtext)\n",
" newtext = re.sub(rf'({nullch}\\d+)','',newtext)\n",
" newtext = re.sub(rf'({nullch})','',newtext)\n",
"re.purge()\n",
"file.close()\n",
"\n",
"with open('generated.html', \"w+\", encoding='utf-8') as file:\n",
" file.write(newtext)\n",
"# π“ƒ΅π“ƒ‘β˜½βœΈβœ¦βœ¦ π“ƒ‘βœ¦βœ¦βœ§ ✧✸"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}