{
"cells": [
{
"cell_type": "markdown",
"id": "04f91d55-f2b4-4e7d-8ef7-b9e5a65cb48c",
"metadata": {},
"source": [
"## install \n",
"requests,beautifulsoup4,deep-translator"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ed08d1f3-0f3d-406f-9c28-07a15c598ffd",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"!python -m pip install requests beautifulsoup4\n",
"#!pip install googletrans==4.0.0-rc1\n",
"!pip install deep-translator"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "79d676c5-90f3-42ed-b552-6f4f1b443bc4",
"metadata": {},
"outputs": [],
"source": [
"url = 'https://www.independent.co.uk/voices/julian-assange-wife-stella-moris-extradition-wikileaks-b2106602.html'\n",
"lang = 'ja'"
]
},
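{
"cell_type": "code",
"execution_count": null,
"id": "a1b2c3d4-langcheck-added-0001",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: list the language codes deep-translator accepts, to confirm\n",
"# the value of `lang` above (e.g. 'ja') before running the long cell below.\n",
"# Assumes the installed deep-translator exposes GoogleTranslator.get_supported_languages().\n",
"from deep_translator import GoogleTranslator\n",
"print(GoogleTranslator(source='auto', target='en').get_supported_languages(as_dict=True))"
]
},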
{
"cell_type": "code",
"execution_count": null,
"id": "8188ecad-7c27-4346-a201-6bc875a8965f",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import requests,re\n",
"from bs4 import BeautifulSoup\n",
"from urllib.parse import urlparse\n",
"\n",
"#googletrans case:\n",
"#from googletrans import Translator\n",
"#translator = Translator()\n",
"\n",
"from deep_translator import GoogleTranslator\n",
"#translated = GoogleTranslator(source='auto', target='de').translate(\"keep it up, you are awesome\") # output -> Weiter so, du bist großartig\n",
"\n",
"\n",
"#url = 'https://www.independent.co.uk/voices/julian-assange-wife-stella-moris-extradition-wikileaks-b2106602.html'\n",
"#lang = 'ja'\n",
"uAgent = {'User-Agent': \"Mozilla/5.0 (Linux; Android 9) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36\",'Referer': 'https://www.google.com/'}\n",
"se = requests.Session()\n",
"res = se.get(url,headers=uAgent)\n",
"sch = urlparse(res.url).scheme\n",
"base = urlparse(res.url).netloc\n",
"\n",
"#import os\n",
"#result = os.popen(\"curl -s \" + url).read()\n",
"#sch = urlparse(url).scheme\n",
"#base = urlparse(url).netloc\n",
"#soup = BeautifulSoup(result, \"html.parser\")\n",
"soup = BeautifulSoup(res.text, \"html.parser\")\n",
"res.close()\n",
"del se\n",
"\n",
"ptag_list_0 = soup.find_all('p')\n",
"h6tag_list_0 = soup.find_all('h6')\n",
"title_list_0 = soup.find_all('title')\n",
"print(title_list_0[0].text)\n",
"title = re.sub(r'\\s','_',title_list_0[0].text)\n",
"\n",
"link = soup.find_all('link')\n",
"if len(link) > 0:\n",
" for index,v in enumerate(link):\n",
" if not v.has_attr('rel'):\n",
" continue\n",
"# print(index,v['rel'])\n",
" if v['rel'] == [\"stylesheet\"]:\n",
" #css location\n",
" #print(type(v))\n",
" if not v.has_attr('href'):\n",
" #if ('href') in v:\n",
" continue\n",
"# print(v['href'])\n",
" if (bool(re.match(r'^http',v['href']))==False):\n",
" if (bool(re.match(r'^\\/',v['href']))==True):\n",
" link[index]['href'] = sch + \"://\" + base + v['href']\n",
" else:\n",
" link[index]['href'] = sch + \"://\" + base + '/' +v['href']\n",
" print(link[index]['href'])\n",
"\n",
"image = soup.find_all('img')\n",
"if len(image) > 0:\n",
" for index,im in enumerate(image):\n",
"# continue\n",
"# print(index,im)\n",
" #if im['alt'] == \"Bellingcat\" or im['alt'] == \"GIJNlogo\":\n",
" if not im.has_attr('src'):\n",
" continue\n",
" if (bool(re.match(r'^http',im['src']))==False):\n",
" print(im['src'])\n",
" # image[index]['src'] = 'https://www.bellingcat.com' + im['src']\n",
" if (bool(re.match(r'^\\/',im['src']))==True):\n",
" image[index]['src'] = sch + '://' + base + im['src']\n",
" print(index,image[index]['src'])\n",
" else:\n",
" image[index]['src'] = sch + '://' + base + '/' + im['src']\n",
" print(index,image[index]['src'])\n",
"\n",
"import time\n",
"counter = 0\n",
"def trans(list0,translator,counter):\n",
"#def trans(list0,lang):\n",
" link_list = []\n",
" link_words_list = []\n",
"\n",
" for index,lines in enumerate(list0):\n",
" counter2 = counter\n",
" print()\n",
" print(index, lines)\n",
"# xxxx = lines.text.strip()\n",
"\n",
" #(?<=\\<p\\>)(.+)(?=\\<\\/p)\n",
" #(\\w|,|\\.|\\&|\\=|;|([ —-]))+(?!([^<]*>))\n",
"\n",
" soup2 = BeautifulSoup(str(lines), \"html.parser\")\n",
" a_link = soup2.find_all('a')\n",
" newtag = []\n",
" if len(a_link) > 0:\n",
" for i,v in enumerate(a_link):\n",
" #link_words = re.findall(r'\\b(\\w+?(?!([^<]*>)))\\b',str(v))\n",
" link_href = v.get('href')\n",
" if (bool(re.search(r'^http',link_href))==False):\n",
" if (bool(re.search(r'^\\/',link_href))==True):\n",
" link_href = sch + '://' + base + link_href\n",
" else:\n",
" link_href = sch + '://' + base + '/' + link_href\n",
"\n",
" link_words = v.text\n",
" print()\n",
" print(\"words\",link_words)\n",
" print(\"a link:\",link_href)\n",
" link_list.append(link_href)\n",
" link_words_list.append(link_words)\n",
"\n",
" if len(link_words) > 0:\n",
" tag = soup.new_tag('a',href= link_href)\n",
" if link_words != '':\n",
" tag.string = link_words\n",
" elif link_words == False:\n",
" tag.string = str(link_href)\n",
" else:\n",
" tag.string = str(link_href)\n",
" newtag.append(tag)\n",
"\n",
" print(newtag)\n",
"\n",
"\n",
" xxxx1 = re.finditer(r'((\\.|\\d|\\w|&|\\=|[ \\(\\)\\-;:,%#+…|\"“’‘”\\'&\\?\\!\\.])*(?!([^<]*>)))',str(lines))\n",
" xxxx2 = \"\"\n",
" for word in xxxx1:\n",
" xxxx2 += word[1] + ' '\n",
" print()\n",
" print(xxxx2)\n",
"\n",
"# mark_words = []\n",
"# mark_words2 = []\n",
"#\n",
"# link_addr = re.findall(r'(?<=href\\=\\\").+?(?=\\\")',str(lines))\n",
"# if len(link_addr) > 0:\n",
"# atag = re.findall(r'(?<=\\<a).+?(?=\\<\\/a)',str(lines))\n",
"# print(atag)\n",
"# for a_text in atag:\n",
"# mark_words += re.findall(r'\\b(\\w+?(?!([^<]*>)))\\b',a_text)\n",
"# for v in mark_words:\n",
"# strvv = ' '.join(v)\n",
"# mark_words2.append(strvv.strip())\n",
"# print(\"words\",mark_words2)\n",
"# print('link:',link_addr)\n",
"\n",
" xxxx3 = re.sub(r\"\\s{3,}\",' ',xxxx2.strip())\n",
" \n",
" print()\n",
" print(xxxx3)\n",
"\n",
"# if(re.match(r'\\w|\\“',xxxx) != None ):\n",
" if(re.match(r'\\w|\\“',xxxx3) != None ):\n",
" print()\n",
"# print(xxxx3)\n",
" #pattern match\n",
"# texts = re.sub(r'\\.\\s+','. ',xxxx)\n",
"# texts = re.sub(r'\\s{2}',' \\'',texts)\n",
" texts = re.sub(r'\\s{2,}',' \\'',xxxx3)\n",
" texts = re.sub(r'\\.\\s+','. ',texts)\n",
" texts = re.sub(r'\\?\\s+','? ',texts)\n",
" texts = re.sub(r'\\!\\s+','! ',texts)\n",
" texts = re.sub(r'\\,\\s+',', ',texts)\n",
" print()\n",
"# print(index, xxxx)\n",
" print(index, texts)\n",
" if len(newtag) > 0:\n",
" for link_v in newtag:\n",
" print('newtag text:',link_v.text)\n",
" print('newtag val:',link_v)\n",
" counter += 1\n",
" try:\n",
" texts = re.sub(rf\"{link_v.text}\",f\"𓃵☽𓃡☽✸✦✦{link_v.text}𓃡✦✦✧{counter}✧✸\",texts)\n",
"# texts = re.sub(rf\"{link_v.text}\",'<span class=\"e;notranslate\"e;>' + f\"𓃵☽𓃡☽✸✦✦{link_v.text}𓃡✦✦✧{counter}✧✸\"+'</span>',texts)\n",
" print('texts :',texts)\n",
" except:\n",
" print('error')\n",
" texts = link_v.text\n",
"\n",
" try:\n",
" print()\n",
" print('translated:')\n",
"# translator = GoogleTranslator(source='auto', target=lang)\n",
" translated = translator.translate(text=texts)\n",
" print(index, translated)\n",
" #googletrans case:\n",
"# translated = translator.translate(str(texts), dest=lang)\n",
"# print(index, translated.text)\n",
" print('______________________________')\n",
"# list0[index].string = translated.text\n",
" list0[index].string = translated\n",
" if len(newtag) > 0:\n",
" for link in newtag:\n",
" counter2 += 1\n",
" div = soup.new_tag('div')\n",
" div.string = '✦link✧✸' + str(counter2) + ':'\n",
" div.append(link)\n",
" list0[index].append(div)\n",
"\n",
" except:\n",
"# time.sleep(5)\n",
" print('translated: fail')\n",
"\n",
" return link_list,link_words_list,soup\n",
"\n",
"#deep-translator case:\n",
"translator = GoogleTranslator(source='auto', target=lang)\n",
"links1,word1,soup = trans(h6tag_list_0,translator,counter)\n",
"links2,word2,soup = trans(ptag_list_0,translator,counter)\n",
"del translator\n",
"#trans(ptag_list_0,lang)\n",
"#trans(h6tag_list_0,lang)\n",
"\n",
"links3 = []\n",
"if links1 != None and links2 != None:\n",
" links3 = links1 + links2\n",
"elif links1 != None:\n",
" links3 = links1\n",
"else:\n",
" pass\n",
"\n",
"word3 = []\n",
"if word1 != None and word2 != None:\n",
" word3 = word1 + word2\n",
"elif word1 != None:\n",
" word3 = word1\n",
"else:\n",
" pass\n",
"\n",
"metatag = soup.new_tag('meta')\n",
"metatag.attrs['charset'] = \"utf-8\"\n",
"soup.head.append(metatag)\n",
"\n",
"#import os\n",
"#filename = os.path.basename(url)\n",
"filename = title[0:6] + '.html'\n",
"\n",
"with open(filename, \"wb\") as f_output:\n",
" f_output.write(soup.prettify(\"utf-8\"))\n",
"\n",
"# 𓃵𓃡☽✸✦✦ 𓃡✦✦✧ ✧✸\n",
"\n",
"file = open(filename, \"r\", encoding='utf-8')\n",
"line_list = file.readlines()\n",
"newtext = \"\"\n",
"re_pattern = re.compile(r\"(𓃡☽✸✦{2}\\S+?𓃡✦{2}✧\\d+?✧✸)\")\n",
"for linebyline in line_list:\n",
" temp_1 = []\n",
" a_link_num = re.findall(r'𓃡☽✸✦{2}\\S+?𓃡✦{2}✧(\\d+?)✧✸',linebyline)\n",
" if len(a_link_num) > 0:\n",
" temp_0 = []\n",
" line2 = linebyline\n",
" for i,v in enumerate(a_link_num):\n",
" if not v in temp_0:\n",
" temp_0.append(v)\n",
" print('a_link_num:',i,v)\n",
" num = int(v)\n",
"\n",
" extract_words = re.finditer(r\"𓃡☽✸✦{2}(\\S+?)𓃡✦{2}✧\\d+?✧✸\",linebyline)\n",
"\n",
" if extract_words != None:\n",
" if num < len(links3):\n",
" for iew,w in enumerate(extract_words):\n",
" ws = str(w.group()) #link_words ...translated word\n",
" if not ws in temp_1:\n",
" temp_1.append(ws)\n",
" print(ws)\n",
" matc = re.findall(re_pattern,line2)\n",
" if len(matc) > 0:\n",
" for ms in matc:\n",
" if (ms.find(ws)) != -1:\n",
"\n",
" link_number = re.match(r'𓃡☽✸✦{2}\\S+?𓃡✦{2}✧(?P<number>\\d+?)✧✸',ws)\n",
" #print('link_number:',link_number.groups()[0])\n",
" # linl_number.groups()[0] == link_number.group('number')\n",
" print('link_number:',link_number.group('number'))\n",
" number = int(link_number.groups()[0])\n",
" embed_link = str(links3[number-1])\n",
" word = str(word3[number-1])\n",
" print('non skipped')\n",
" line2 = line2.replace(ws,f\"<a href={embed_link}>{ws}</a>\")\n",
"\n",
" else:\n",
" print('skipped!!!')\n",
" newtext += line2\n",
" else:\n",
" newtext += linebyline\n",
" newtext = re.sub(r'𓃵|☽|✸|✦✦|𓃡☽|𓃡','',newtext)\n",
"re.purge()\n",
"file.close()\n",
"\n",
"with open('generated.html', \"w+\", encoding='utf-8') as file:\n",
" file.write(newtext)\n",
"# 𓃵𓃡☽✸✦✦ 𓃡✦✦✧ ✧✸"
]
}
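,
{
"cell_type": "code",
"execution_count": null,
"id": "f0e1d2c3-preview-added-0002",
"metadata": {},
"outputs": [],
"source": [
"# Optional: preview the translated page inline. Assumes the cell above ran\n",
"# successfully and wrote 'generated.html' next to this notebook.\n",
"from IPython.display import IFrame\n",
"IFrame('generated.html', width='100%', height=600)"
]
}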
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}