@dauuricus
Created June 23, 2022 08:47
Web scraping & translation (Jupyter notebook)
{"metadata":{"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.12"}},"nbformat_minor":5,"nbformat":4,"cells":[{"cell_type":"code","source":"#!pip install googletrans==4.0.0-rc1\n!python -m pip install requests beautifulsoup4\n!pip install deep-translator\n\nimport requests,re\nfrom bs4 import BeautifulSoup\nfrom urllib.parse import urlparse\n\n#from googletrans import Translator\n#translator = Translator()\n\nfrom deep_translator import GoogleTranslator\n#translated = GoogleTranslator(source='auto', target='de').translate(\"keep it up, you are awesome\") # output -> Weiter so, du bist großartig\n\nimport sys\nurl = 'https://www.independent.co.uk/voices/julian-assange-wife-stella-moris-extradition-wikileaks-b2106602.html'\nlang = 'ja'\nuAgent = {'User-Agent': \"Mozilla/5.0 (Linux; Android 9) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36\",'Referer': 'https://www.google.com/'}\nse = requests.Session()\nres = se.get(url,headers=uAgent)\nsch = urlparse(res.url).scheme\nbase = urlparse(res.url).netloc\n\n#import os\n#result = os.popen(\"curl -s \" + url).read()\n#sch = urlparse(url).scheme\n#base = urlparse(url).netloc\n#soup = BeautifulSoup(result, \"html.parser\")\nsoup = BeautifulSoup(res.text, \"html.parser\")\nres.close()\ndel se\n\nptag_list_0 = soup.find_all('p')\nh6tag_list_0 = soup.find_all('h6')\ntitle_list_0 = soup.find_all('title')\nprint(title_list_0[0].text)\ntitle = re.sub(r'\\s','_',title_list_0[0].text)\n\nlink = soup.find_all('link')\nif len(link) > 0:\n for index,v in enumerate(link):\n if not v.has_attr('rel'):\n continue\n# print(index,v['rel'])\n if v['rel'] == [\"stylesheet\"]:\n #css location\n #print(type(v))\n if not v.has_attr('href'):\n #if ('href') in v:\n continue\n# print(v['href'])\n if (bool(re.match(r'^http',v['href']))==False):\n if (bool(re.match(r'^\\/',v['href']))==True):\n link[index]['href'] = sch + \"://\" + base + v['href']\n else:\n link[index]['href'] = sch + \"://\" + base + '/' +v['href']\n print(link[index]['href'])\n\nimage = soup.find_all('img')\nif len(image) > 0:\n for index,im in enumerate(image):\n# continue\n# print(index,im)\n #if im['alt'] == \"Bellingcat\" or im['alt'] == \"GIJNlogo\":\n if not im.has_attr('src'):\n continue\n if (bool(re.match(r'^http',im['src']))==False):\n print(im['src'])\n # image[index]['src'] = 'https://www.bellingcat.com' + im['src']\n if (bool(re.match(r'^\\/',im['src']))==True):\n image[index]['src'] = sch + '://' + base + im['src']\n print(index,image[index]['src'])\n else:\n image[index]['src'] = sch + '://' + base + '/' + im['src']\n print(index,image[index]['src'])\n\nimport time\ncounter = 0\ndef trans(list0,translator,counter):\n#def trans(list0,lang):\n link_list = []\n link_words_list = []\n\n for index,lines in enumerate(list0):\n counter2 = counter\n print()\n print(index, lines)\n# xxxx = lines.text.strip()\n #xxxx1 = re.finditer(r'\\b((\\=|\\.|\\d|\\w|[ -;:,\"“’\\'&\\?\\!\\.])*(?!([^<]*>)))',str(lines))\n\n #(?<=\\<p\\>)(.+)(?=\\<\\/p)\n #(\\w|,|\\.|\\&|\\=|;|([ —-]))+(?!([^<]*>))\n\n soup2 = BeautifulSoup(str(lines), \"html.parser\")\n a_link = soup2.find_all('a')\n newtag = []\n if len(a_link) > 0:\n for i,v in enumerate(a_link):\n #link_words = re.findall(r'\\b(\\w+?(?!([^<]*>)))\\b',str(v))\n link_href = 
re.findall(r'(?<=href\\=\\\").+?(?=\\\")',str(v))\n for ilh,lh in enumerate(link_href):\n if (bool(re.search(r'^http',lh))==False):\n if (bool(re.search(r'^\\/',lh))==True):\n link_href[ilh] = sch + '://' + base + lh\n else:\n link_href[ilh] = sch + '://' + base + '/' + lh\n link_words = v.text\n print()\n print(\"words\",link_words)\n print(\"a link:\",link_href)\n link_list.append(link_href)\n link_words_list.append(link_words)\n\n if len(link_words) > 0:\n tag = soup.new_tag('a',href= link_href[0])\n if link_words != '':\n tag.string = link_words\n elif link_words == False:\n tag.string = str(link_href[0])\n else:\n tag.string = str(link_href[0])\n newtag.append(tag)\n\n print(newtag)\n\n xxxx1 = re.finditer(r'((\\.|\\d|\\w|&|\\=|[ \\(\\)\\-;:,%#+…|\"“’‘”\\'&\\?\\!\\.])*(?!([^<]*>)))',str(lines))\n xxxx2 = \"\"\n for word in xxxx1:\n xxxx2 += word[1] + ' '\n print()\n print(xxxx2)\n\n# mark_words = []\n# mark_words2 = []\n#\n# link_addr = re.findall(r'(?<=href\\=\\\").+?(?=\\\")',str(lines))\n# if len(link_addr) > 0:\n# atag = re.findall(r'(?<=\\<a).+?(?=\\<\\/a)',str(lines))\n# print(atag)\n# for a_text in atag:\n# mark_words += re.findall(r'\\b(\\w+?(?!([^<]*>)))\\b',a_text)\n# for v in mark_words:\n# strvv = ' '.join(v)\n# mark_words2.append(strvv.strip())\n# print(\"words\",mark_words2)\n# print('link:',link_addr)\n\n xxxx3 = re.sub(r\"\\s{3,}\",' ',xxxx2.strip())\n \n print()\n print(xxxx3)\n\n# if(re.match(r'\\w|\\“',xxxx) != None ):\n if(re.match(r'\\w|\\“',xxxx3) != None ):\n print()\n# print(xxxx3)\n #pattern match\n# texts = re.sub(r'\\.\\s+','. ',xxxx)\n# texts = re.sub(r'\\s{2}',' \\'',texts)\n texts = re.sub(r'\\s{2,}',' \\'',xxxx3)\n texts = re.sub(r'\\.\\s+','. ',texts)\n texts = re.sub(r'\\?\\s+','? ',texts)\n texts = re.sub(r'\\!\\s+','! 
',texts)\n texts = re.sub(r'\\,\\s+',', ',texts)\n print()\n# print(index, xxxx)\n print(index, texts)\n if len(newtag) > 0:\n for link_v in newtag:\n print('newtag text:',link_v.text)\n print('newtag val:',link_v)\n counter += 1\n try:\n texts = re.sub(rf\"{link_v.text}\",f\"𓃵☽𓃡☽✸✦✦{link_v.text}𓃡✦✦✧{counter}✧✸\",texts)\n# texts = re.sub(rf\"{link_v.text}\",'<span class=\"e;notranslate\"e;>' + f\"𓃵☽𓃡☽✸✦✦{link_v.text}𓃡✦✦✧{counter}✧✸\"+'</span>',texts)\n print('texts :',texts)\n except:\n print('error')\n texts = link_v.text\n\n try:\n print()\n print('translated:')\n# translator = GoogleTranslator(source='auto', target=lang)\n translated = translator.translate(text=texts)\n print(index, translated)\n# translated = translator.translate(str(texts), dest=lang)\n# print(index, translated.text)\n print('______________________________')\n# list0[index].string = translated.text\n list0[index].string = translated\n if len(newtag) > 0:\n for link in newtag:\n counter2 += 1\n div = soup.new_tag('div')\n div.string = '✦link✧✸' + str(counter2) + ':'\n div.append(link)\n list0[index].append(div)\n\n except:\n# time.sleep(5)\n print('translated: fail')\n\n return link_list,link_words_list,soup\n\ntranslator = GoogleTranslator(source='auto', target=lang)\nlinks1,word1,soup = trans(h6tag_list_0,translator,counter)\nlinks2,word2,soup = trans(ptag_list_0,translator,counter)\ndel translator\n#trans(ptag_list_0,lang)\n#trans(h6tag_list_0,lang)\n\nlinks3 = []\nif links1 != None and links2 != None:\n links3 = links1 + links2\nelif links1 != None:\n links3 = links1\nelse:\n pass\n\nword3 = []\nif word1 != None and word2 != None:\n word3 = word1 + word2\nelif word1 != None:\n word3 = word1\nelse:\n pass\n\nmetatag = soup.new_tag('meta')\nmetatag.attrs['charset'] = \"utf-8\"\nsoup.head.append(metatag)\n\n#import os\n#filename = os.path.basename(url)\nfilename = title[0:6] + '.html'\n\nwith open(filename, \"wb\") as f_output:\n f_output.write(soup.prettify(\"utf-8\"))\n\n# 𓃵𓃡☽✸✦✦ 𓃡✦✦✧ ✧✸\n\nfile = open(filename, \"r\", encoding='utf-8')\nline_list = file.readlines()\nnewtext = \"\"\nre_pattern = re.compile(r\"(𓃡☽✸✦{2}\\S+?𓃡✦{2}✧\\d+?✧✸)\")\nfor linebyline in line_list:\n temp_1 = []\n a_link_num = re.findall(r'𓃡☽✸✦{2}\\S+?𓃡✦{2}✧(\\d+?)✧✸',linebyline)\n if len(a_link_num) > 0:\n temp_0 = []\n line2 = linebyline\n for i,v in enumerate(a_link_num):\n if not v in temp_0:\n temp_0.append(v)\n print('a_link_num:',i,v)\n num = int(v)\n\n extract_words = re.finditer(r\"𓃡☽✸✦{2}(\\S+?)𓃡✦{2}✧\\d+?✧✸\",linebyline)\n\n if extract_words != None:\n if num < len(links3):\n for iew,w in enumerate(extract_words):\n ws = str(w.group()) #link_words ...translated word\n if not ws in temp_1:\n temp_1.append(ws)\n print(ws)\n matc = re.findall(re_pattern,line2)\n if len(matc) > 0:\n for ms in matc:\n if (ms.find(ws)) != -1:\n\n link_number = re.match(r'𓃡☽✸✦{2}\\S+?𓃡✦{2}✧(?P<number>\\d+?)✧✸',ws)\n #print('link_number:',link_number.groups()[0])\n # linl_number.groups()[0] == link_number.group('number')\n print('link_number:',link_number.group('number'))\n number = int(link_number.groups()[0])\n embed_link = str(*links3[number-1])\n word = str(word3[number-1])\n print('non skipped')\n line2 = line2.replace(ws,f\"<a href={embed_link}>{ws}</a>\")\n\n else:\n print('skipped!!!')\n newtext += line2\n else:\n newtext += linebyline\n newtext = re.sub(r'𓃵|☽|✸|✦✦|𓃡☽|𓃡','',newtext)\nre.purge()\nfile.close()\n\nwith open('generated.html', \"w+\", encoding='utf-8') as file:\n file.write(newtext)\n# 𓃵𓃡☽✸✦✦ 𓃡✦✦✧ 
✧✸","metadata":{},"execution_count":null,"outputs":[],"id":"9d57ff27-4c22-4189-b05d-8df11ac86f67"}]}