binder notebook
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d57ff27-4c22-4189-b05d-8df11ac86f67",
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install googletrans==4.0.0-rc1\n",
    "!python -m pip install requests beautifulsoup4\n",
    "!pip install deep-translator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f183580d-3e75-4d0d-b7f7-be3bcba8ef79",
   "metadata": {},
   "outputs": [],
   "source": [
    "url = 'https://mybinder.readthedocs.io/en/latest/introduction.html#what-is-mybinder-org'\n",
    "lang = 'ja'"
   ]
  },
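  {
   "cell_type": "markdown",
   "id": "e1a2b3c4-0001-4f5a-9b6c-7d8e9f0a1b2c",
   "metadata": {},
   "source": [
    "A quick sanity check of the `deep-translator` call that the main cell below relies on. This is a minimal sketch: it assumes the package installed above and a working network connection, and the exact translated string depends on Google Translate. (The main cell also resolves relative links; an alternative sketch for that step follows it.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2b3c4d5-0002-4a6b-8c7d-8e9f0a1b2c3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sketch: the same GoogleTranslator(source='auto', target=lang)\n",
    "# constructor and .translate() call that the main cell uses.\n",
    "from deep_translator import GoogleTranslator\n",
    "print(GoogleTranslator(source='auto', target=lang).translate('What is mybinder.org?'))"
   ]
  },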
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24b56ee3-a4ca-42bf-8ec8-7e6d347eb5b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "##!pip install googletrans==4.0.0-rc1\n",
    "##!pip install deep-translator\n",
    "##!python -m pip install requests beautifulsoup4\n",
    "\n",
    "import requests,re\n",
    "from bs4 import BeautifulSoup\n",
    "from urllib.parse import urlparse\n",
    "\n",
    "#from googletrans import Translator\n",
    "#translator = Translator()\n",
    "\n",
    "from deep_translator import GoogleTranslator\n",
    "#translated = GoogleTranslator(source='auto', target='de').translate(\"keep it up, you are awesome\") # output -> Weiter so, du bist großartig\n",
    "\n",
    "uAgent = {'User-Agent': \"Mozilla/5.0 (Linux; Android 9) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36\",'Referer': 'https://www.google.com/'}\n",
    "se = requests.Session()\n",
    "res = se.get(url,headers=uAgent)\n",
    "sch = urlparse(res.url).scheme\n",
    "base = urlparse(res.url).netloc\n",
    "import os\n",
    "urldirname = os.path.dirname(res.url)\n",
    "\n",
    "import pathlib\n",
    "basepath = pathlib.Path(res.url)\n",
"\n", | |
"#import os\n", | |
"#result = os.popen(\"curl -s \" + url).read()\n", | |
"#sch = urlparse(url).scheme\n", | |
"#base = urlparse(url).netloc\n", | |
"#soup = BeautifulSoup(result, \"html.parser\")\n", | |
"soup = BeautifulSoup(res.text, \"html.parser\")\n", | |
"res.close()\n", | |
"del se\n", | |
"\n", | |
"ptag_list_0 = soup.find_all('p')\n", | |
"h6tag_list_0 = soup.find_all('h6')\n", | |
"title_list_0 = soup.find_all('title')\n", | |
"print(title_list_0[0].text)\n", | |
"title = re.sub(r'\\s','_',title_list_0[0].text)\n", | |
"\n", | |
"nullch = 'π'\n", | |
"#nullch = 'π'\n", | |
"code_tag = soup.find_all('code')\n", | |
"code_counter = 0\n", | |
"code_contents = []\n", | |
"if len(code_tag) > 0:\n", | |
" for index,tag in enumerate(code_tag):\n", | |
" print(index,tag)\n", | |
" if re.match(r'\\<code(\\S|\\s).*?\\>',str(tag)):\n", | |
" code_contents.append(str(tag))\n", | |
" strip_tag = re.sub(r'\\<code(\\S|\\s)*?>',f\"{nullch}{code_counter}{nullch}\",str(tag))\n", | |
" strip_tag = re.sub(r'\\<\\/code\\>',f\"{nullch}{nullch}{code_counter}{nullch}{nullch}\",strip_tag)\n", | |
" code_tag[index].string = strip_tag\n", | |
" print(index,strip_tag)\n", | |
" code_counter += 1\n", | |
"\n", | |
"link = soup.find_all('link')\n", | |
"if len(link) > 0:\n", | |
" for index,v in enumerate(link):\n", | |
" if not v.has_attr('rel'):\n", | |
" continue\n", | |
"# print(index,v['rel'])\n", | |
" if v['rel'] == [\"stylesheet\"]:\n", | |
" #css location\n", | |
" #print(type(v))\n", | |
" if not v.has_attr('href'):\n", | |
" #if ('href') in v:\n", | |
" continue\n", | |
"# print(v['href'])\n", | |
" if (bool(re.match(r'^http',v['href']))==False):\n", | |
" print(v['href'])\n", | |
" if (bool(re.match(r'^\\/',v['href']))==True):\n", | |
"# link[index]['href'] = sch + \"://\" + base + v['href']\n", | |
" link[index]['href'] = urldirname + v['href']\n", | |
" else:\n", | |
" if re.match(r'^\\.',v['href']):\n", | |
" temp_work = pathlib.Path(str(basepath) + '/'+ v['href']).resolve()\n", | |
" link[index]['href'] = re.sub(rf\"^\\S+?(?={base})\",sch + \"://\",str(temp_work))\n", | |
" else:\n", | |
"# link[index]['href'] = sch + \"://\" + base + '/' +v['href']\n", | |
" link[index]['href'] = urldirname + '/' +v['href']\n", | |
" print(link[index]['href'])\n", | |
"\n", | |
"image = soup.find_all('img')\n", | |
"if len(image) > 0:\n", | |
" for index,im in enumerate(image):\n", | |
"# print(index,im)\n", | |
" #if im['alt'] == \"Bellingcat\" or im['alt'] == \"GIJNlogo\":\n", | |
" if not im.has_attr('src'):\n", | |
" continue\n", | |
" if (bool(re.match(r'^http',im['src']))==False):\n", | |
" print(im['src'])\n", | |
" # image[index]['src'] = 'https://www.bellingcat.com' + im['src']\n", | |
" if (bool(re.match(r'^\\/',im['src']))==True):\n", | |
"# image[index]['src'] = sch + '://' + base + im['src']\n", | |
" image[index]['src'] = urldirname + '/' + im['src']\n", | |
" else:\n", | |
" if re.match(r'^\\.',im['src']):\n", | |
" temp_work = pathlib.Path(str(basepath) + '/'+ im['src']).resolve()\n", | |
" image[index]['src'] = re.sub(rf\"^\\S+?(?={base})\",sch + \"://\",str(temp_work))\n", | |
" else:\n", | |
"# image[index]['src'] = sch + '://' + base + '/' + im['src']\n", | |
" image[index]['src'] = urldirname + '/' + im['src']\n", | |
" print(index,image[index]['src'])\n", | |
"\n", | |
"\n", | |
"import time\n", | |
"counter = 0\n", | |
"def trans(list0,translator,counter):\n", | |
"#def trans(list0,lang):\n", | |
" link_list = []\n", | |
" link_words_list = []\n", | |
"\n", | |
" for index,lines in enumerate(list0):\n", | |
" counter2 = counter\n", | |
" print()\n", | |
" print(index, lines)\n", | |
"# xxxx = lines.text.strip()\n", | |
" #xxxx1 = re.finditer(r'\\b((\\=|\\.|\\d|\\w|[ -;:,\"ββ\\'&\\?\\!\\.])*(?!([^<]*>)))',str(lines))\n", | |
"\n", | |
" #(?<=\\<p\\>)(.+)(?=\\<\\/p)\n", | |
" #(\\w|,|\\.|\\&|\\=|;|([ β-]))+(?!([^<]*>))\n", | |
"\n", | |
" soup2 = BeautifulSoup(str(lines), \"html.parser\")\n", | |
" a_link = soup2.find_all('a')\n", | |
" newtag = []\n", | |
" if len(a_link) > 0:\n", | |
" for i,v in enumerate(a_link):\n", | |
" #link_words = re.findall(r'\\b(\\w+?(?!([^<]*>)))\\b',str(v))\n", | |
" if v.has_attr('href'):\n", | |
" pass\n", | |
" else:\n", | |
" continue\n", | |
" link_href = v.get('href')\n", | |
" if (bool(re.search(r'^http',link_href))==False):\n", | |
" if (bool(re.match(r'^\\/',link_href))==True):\n", | |
" temp_work = pathlib.Path(str(basepath) + link_href).resolve()\n", | |
" link_href = re.sub(rf\"^\\S+?(?={base})\",sch + \"://\",str(temp_work))\n", | |
" #link_href = sch + '://' + base + link_href\n", | |
" else:\n", | |
" temp_work = pathlib.Path(str(basepath) + '\\/'+ link_href).resolve()\n", | |
" link_href = re.sub(rf\"^\\S+?(?={base})\",sch + \"://\",str(temp_work))\n", | |
" #link_href = sch + '://' + base + '/' + link_href\n", | |
" link_words = v.text\n", | |
" print()\n", | |
" print(\"words\",link_words)\n", | |
" print(\"a link:\",link_href)\n", | |
" link_list.append(link_href)\n", | |
" link_words_list.append(link_words)\n", | |
"\n", | |
" if len(link_words) > 0:\n", | |
" tag = soup.new_tag('a',href= link_href)\n", | |
" if link_words != '':\n", | |
" tag.string = link_words\n", | |
" elif link_words == False:\n", | |
" tag.string = str(link_href)\n", | |
" else:\n", | |
" tag.string = str(link_href)\n", | |
" newtag.append(tag)\n", | |
"\n", | |
" print(newtag)\n", | |
" xxxx0 = re.sub(r'\\<p\\>|\\<\\/p\\>','',str(lines))\n", | |
" xxxx1 = re.finditer(r'((\\.|\\d|\\w|&|\\=|[ \\/\\(\\)\\-;:,β%#+β¦|\"ββββ\\'&\\?\\!\\.])*(?!([^<]*>)))',xxxx0)\n", | |
" xxxx2 = \"\"\n", | |
" for word in xxxx1:\n", | |
" t = word[1]\n", | |
" xxxx2 += t + 'π'\n", | |
" print()\n", | |
" print(xxxx2)\n", | |
"\n", | |
"# mark_words = []\n", | |
"# mark_words2 = []\n", | |
"#\n", | |
"# link_addr = re.findall(r'(?<=href\\=\\\").+?(?=\\\")',str(lines))\n", | |
"# if len(link_addr) > 0:\n", | |
"# atag = re.findall(r'(?<=\\<a).+?(?=\\<\\/a)',str(lines))\n", | |
"# print(atag)\n", | |
"# for a_text in atag:\n", | |
"# mark_words += re.findall(r'\\b(\\w+?(?!([^<]*>)))\\b',a_text)\n", | |
"# for v in mark_words:\n", | |
"# strvv = ' '.join(v)\n", | |
"# mark_words2.append(strvv.strip())\n", | |
"# print(\"words\",mark_words2)\n", | |
"# print('link:',link_addr)\n", | |
"\n", | |
" #xxxx3 = re.sub(r\"\\s{3,}\",' ',xxxx2.strip())\n", | |
" xxxx3 = re.sub(r\"π\",'',xxxx2.strip())\n", | |
"\n", | |
" print()\n", | |
" print(xxxx3)\n", | |
"\n", | |
"# if(re.match(r'\\w|\\β',xxxx) != None ):\n", | |
" if(re.match(r'\\w|\\β',xxxx3) != None ):\n", | |
" print()\n", | |
"# print(xxxx3)\n", | |
" #pattern match\n", | |
"# texts = re.sub(r'\\.\\s+','. ',xxxx)\n", | |
"# texts = re.sub(r'\\s{2}',' \\'',texts)\n", | |
" texts = xxxx3\n", | |
" texts = re.sub(r'\\s{2,}',' \\'',texts)\n", | |
" texts = re.sub(r'\\.\\s+','. ',texts)\n", | |
" texts = re.sub(r'\\?\\s+','? ',texts)\n", | |
" texts = re.sub(r'\\!\\s+','! ',texts)\n", | |
" texts = re.sub(r'\\,\\s+',', ',texts)\n", | |
" print()\n", | |
"# print(index, xxxx)\n", | |
" print(index, texts)\n", | |
" if len(newtag) > 0:\n", | |
" for link_v in newtag:\n", | |
" print('newtag text:',link_v.text)\n", | |
" print('newtag val:',link_v)\n", | |
" counter += 1\n", | |
" try:\n", | |
" texts = re.sub(rf\"{link_v.text}\",f\"βπ‘{link_v.text}π‘β¦β§{counter}β§βΈβ\",texts)\n", | |
"# texts = re.sub(rf\"{link_v.text}\",'<span class=\"e;notranslate\"e;>' + f\"π΅β½π‘β½βΈβ¦β¦{link_v.text}π‘β¦β¦β§{counter}β§βΈ\"+'</span>',texts)\n", | |
" print('texts :',texts)\n", | |
" except:\n", | |
" print('error')\n", | |
" texts = link_v.text\n", | |
"\n", | |
" try:\n", | |
" print()\n", | |
" print('translated:')\n", | |
"# translator = GoogleTranslator(source='auto', target=lang)\n", | |
" translated = translator.translate(text=texts)\n", | |
" print(index, translated)\n", | |
"# translated = translator.translate(str(texts), dest=lang)\n", | |
"# print(index, translated.text)\n", | |
" print('______________________________')\n", | |
"# list0[index].string = translated.text\n", | |
" list0[index].string = translated\n", | |
" if len(newtag) > 0:\n", | |
" for link in newtag:\n", | |
" counter2 += 1\n", | |
" div = soup.new_tag('div')\n", | |
" div.string = 'β¦linkβ§βΈ' + str(counter2) + ':'\n", | |
" div.append(link)\n", | |
" list0[index].append(div)\n", | |
"\n", | |
" except:\n", | |
"# time.sleep(5)\n", | |
" print('translated: fail')\n", | |
"\n", | |
" return link_list,link_words_list,soup\n", | |
"\n", | |
"translator = GoogleTranslator(source='auto', target=lang)\n", | |
"links1,word1,soup = trans(h6tag_list_0,translator,counter)\n", | |
"links2,word2,soup = trans(ptag_list_0,translator,counter)\n", | |
"del translator\n", | |
"#trans(ptag_list_0,lang)\n", | |
"#trans(h6tag_list_0,lang)\n", | |
"\n", | |
"links3 = []\n", | |
"if links1 != None and links2 != None:\n", | |
" links3 = links1 + links2\n", | |
"elif links1 != None:\n", | |
" links3 = links1\n", | |
"else:\n", | |
" pass\n", | |
"\n", | |
"word3 = []\n", | |
"if word1 != None and word2 != None:\n", | |
" word3 = word1 + word2\n", | |
"elif word1 != None:\n", | |
" word3 = word1\n", | |
"else:\n", | |
" pass\n", | |
"\n", | |
"metatag = soup.new_tag('meta')\n", | |
"metatag.attrs['charset'] = \"utf-8\"\n", | |
"soup.head.append(metatag)\n", | |
"\n", | |
"#import os\n", | |
"#filename = os.path.basename(url)\n", | |
"filename = title[0:6] + '.html'\n", | |
"filename = re.sub(r'\\/','_',filename)\n", | |
"\n", | |
"with open(filename, \"wb\") as f_output:\n", | |
" f_output.write(soup.prettify(\"utf-8\"))\n", | |
"\n", | |
"# βπ΅π‘β½βΈβ¦β¦ π‘β¦β¦β§ β§βΈ\n", | |
"\n", | |
"file = open(filename, \"r\", encoding='utf-8')\n", | |
"line_list = file.readlines()\n", | |
"newtext = \"\"\n", | |
"re_pattern = re.compile(r\"(π‘\\S+?π‘β¦β§\\S+?β§βΈ)\")\n", | |
"re_pattern2 = re.compile(r\"(β¦β§\\S+?β§βΈ)\")\n", | |
"for linebyline in line_list:\n", | |
" temp_1 = []\n", | |
" temp_2 = []\n", | |
" #a_link_num = re.findall(r'π‘\\S+?π‘β¦β§(\\d+?)β§βΈ',linebyline)\n", | |
" a_link_num = re.findall(r'π‘.*?π‘β¦β§(\\S+?)β§βΈ',linebyline)\n", | |
" if len(a_link_num) > 0:\n", | |
" temp_0 = []\n", | |
" line2 = linebyline\n", | |
" for i,v in enumerate(a_link_num):\n", | |
" if not v in temp_0:\n", | |
" temp_2.append(v)\n", | |
" temp_0.append(v)\n", | |
" print('a_link_num:',i,v)\n", | |
" num = int(v)\n", | |
"\n", | |
" #extract_words = re.finditer(r\"π‘(\\S+?)π‘β¦β§\\d+?β§βΈ\",linebyline)\n", | |
" extract_words = re.finditer(r\"π‘(\\S+?)π‘β¦β§\\S+?β§βΈ\",linebyline)\n", | |
"\n", | |
" if extract_words != None:\n", | |
" if num < len(links3):\n", | |
" for iew,w in enumerate(extract_words):\n", | |
" ws = str(w.group()) #link_words ...translated word\n", | |
" if not ws in temp_1:\n", | |
" temp_1.append(ws)\n", | |
" print(ws)\n", | |
" matc = re.findall(re_pattern,line2)\n", | |
" if len(matc) > 0:\n", | |
" for ms in matc:\n", | |
" if (ms.find(ws)) != -1:\n", | |
"\n", | |
" link_number = re.match(r'π‘\\S+?π‘β¦β§(?P<number>\\S+?)β§βΈ',ws)\n", | |
" #print('link_number:',link_number.groups()[0])\n", | |
" # linl_number.groups()[0] == link_number.group('number')\n", | |
" print('link_number:',link_number.group('number'))\n", | |
" number = int(link_number.groups()[0])\n", | |
" embed_link = str(links3[number - 1])\n", | |
" word = str(word3[number-1])\n", | |
" print('non skipped')\n", | |
" striped_ws = re.sub(r'π|βΈ|β¦|π‘|','',ws)\n", | |
" print(striped_ws)\n", | |
" if (bool(re.search(rf\"{ws}\",line2))==True):\n", | |
" print(line2)\n", | |
" line2 = line2.replace(ws,f\"<a href={embed_link}>{striped_ws}</a>\",1)\n", | |
" print(line2)\n", | |
" #line2 = re.sub(r'π|βΈ|β¦|π‘|','',line2)\n", | |
" break\n", | |
"\n", | |
" else:\n", | |
" print('skipped!!!')\n", | |
"\n", | |
" newtext += line2\n", | |
" else:\n", | |
" newtext += linebyline\n", | |
"\n", | |
"\n", | |
" #a_link_num2 = re.findall(r'β¦β§(\\d+?)β§βΈ',line2)\n", | |
" a_link_num2 = re.findall(r'β¦β§(\\S+?)β§βΈ',newtext)\n", | |
" if len(a_link_num2) > 0:\n", | |
" temp_0 = []\n", | |
" for i,v in enumerate(a_link_num2):\n", | |
" print('a_link_num2:',i,v)\n", | |
" if not v in temp_2:\n", | |
" print(temp_2)\n", | |
" if not v in temp_0:\n", | |
" temp_0.append(v)\n", | |
" print('a_link_num2:',i,v)\n", | |
" num = int(v)\n", | |
" extract_words2 = v\n", | |
" if extract_words2 != None:\n", | |
" if num < len(links3):\n", | |
" if not extract_words2 in temp_1:\n", | |
" temp_1.append(extract_words2)\n", | |
" print(extract_words2)\n", | |
" matc = re.findall(re_pattern2,newtext)\n", | |
" if len(matc) > 0:\n", | |
" for ms in matc:\n", | |
" if (ms.find(extract_words2)) != -1:\n", | |
"\n", | |
" link_number = num\n", | |
" print('link_number:',num)\n", | |
" embed_link = str(links3[num - 1])\n", | |
" word = str(word3[num - 1])\n", | |
" print('non skipped')\n", | |
" newtext= newtext.replace('β¦β§'+ extract_words2 + 'β§βΈ',f\"<a href={embed_link}>β¦β§{extract_words2}β§βΈ</a>\")\n", | |
" newtext = re.sub(r'π|βΈ|β¦|π‘|','',newtext)\n", | |
"\n", | |
" else:\n", | |
" print('skipped!!!')\n", | |
" \n", | |
" codetag = re.findall(rf\"{nullch}\\d+?{nullch}.+?{nullch}{nullch}\\d+?{nullch}{nullch}\",newtext)\n", | |
" if len(codetag) > 0:\n", | |
" print('code found!')\n", | |
" for cv in codetag:\n", | |
" counter_num = re.match(rf\"{nullch}(\\d+?){nullch}\",str(cv))\n", | |
" print(counter_num)\n", | |
" match1 = counter_num.group(0)\n", | |
" i = re.sub(rf\"{nullch}\",'',match1)\n", | |
" print(\"i:\",i)\n", | |
" contents = code_contents[int(i)]\n", | |
" print('code:',contents)\n", | |
" if len(re.findall(rf\"{match1}\",cv)) != 2:\n", | |
" #text = re.sub(rf\"{match1}\",contents,str(cv))\n", | |
" #newtext = re.sub(rf\"{nullch}\\d+?{nullch}\",text,newtext,1)\n", | |
" continue\n", | |
" print(cv)\n", | |
" text = re.sub(rf\"^{nullch}\\d+?{nullch}.+?{nullch}{nullch}\\d+?{nullch}{nullch}\",contents,str(cv))\n", | |
" #text = re.sub(r'^π\\d+?π','<code>',str(cv))\n", | |
" #text = re.sub(r'ππ\\d+?ππ','</code>',str(text))\n", | |
" newtext = re.sub(rf\"{nullch}\\d+?{nullch}.+?{nullch}{nullch}\\d+?{nullch}{nullch}\",text,newtext,1)\n", | |
" #newtext = re.sub(r'π\\d+?π.+?ππ\\d+?ππ',str(text),newtext,1)\n", | |
" newtext = re.sub(rf'({nullch}{nullch}\\d+?{nullch}{nullch})','',newtext)\n", | |
" newtext = re.sub(rf'({nullch}\\d+?{nullch})','',newtext)\n", | |
" newtext = re.sub(rf'({nullch}\\d+)','',newtext)\n", | |
" newtext = re.sub(rf'({nullch})','',newtext)\n", | |
"re.purge()\n", | |
"file.close()\n", | |
"\n", | |
"with open('generated.html', \"w+\", encoding='utf-8') as file:\n", | |
" file.write(newtext)\n", | |
"# π΅π‘β½βΈβ¦β¦ π‘β¦β¦β§ β§βΈ" | |
] | |
} | |
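  {
   "cell_type": "markdown",
   "id": "b7f3c2d1-0003-4c8e-9f21-5e6a7b8c9d0e",
   "metadata": {},
   "source": [
    "The cell above resolves relative `href`/`src` values by treating the page URL as a `pathlib.Path` and patching the scheme back with a regex. For comparison, here is a minimal sketch of the standard-library alternative, `urllib.parse.urljoin`, which resolves plain, dot-relative and root-relative references in one call (not what the cell above uses):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8a4d3e2-0004-4d9f-8a32-6f7b8c9d0e1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: urljoin resolves each kind of relative reference against the page URL.\n",
    "from urllib.parse import urljoin\n",
    "\n",
    "page = 'https://mybinder.readthedocs.io/en/latest/introduction.html'\n",
    "for href in ('style.css', './deploy.css', '../img/logo.png', '/favicon.ico'):\n",
    "    print(href, '->', urljoin(page, href))"
   ]
  }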
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}