The code
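A batch translator for static HTML pages: the script walks a folder of .html files, machine-translates the visible text (title, meta description, and the article elements between the ARTICOL START / ARTICOL FINAL comment markers) with googletrans, rewrites the language segment of canonical and Open Graph URLs and of the lang attribute, and saves each result as <name>_<language>.html.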
import re
import os

from bs4 import BeautifulSoup
from bs4.formatter import HTMLFormatter
from googletrans import Translator

translator = Translator()

class UnsortedAttributes(HTMLFormatter):
    """Serializer that keeps tag attributes in their original order."""
    def attributes(self, tag):
        for k, v in tag.attrs.items():
            yield k, v

files_from_folder = r"c:\Folder3\1"  # folder holding the source HTML files
use_translate_folder = True          # write output to a 'translated' subfolder
destination_language = 'ru'
extension_file = ".html"
directory = os.fsencode(files_from_folder)
def recursively_translate(node):
    """Translate every non-empty text node under `node` in place."""
    for x in range(len(node.contents)):
        if isinstance(node.contents[x], str):
            if node.contents[x].strip() != '':
                if 'pastebin.com' not in node.contents[x]:  # leave pastebin snippets untouched
                    try:
                        node.contents[x].replace_with(translator.translate(node.contents[x], dest=destination_language).text)
                    except Exception:
                        pass  # keep the original text if the translation call fails
        elif node.contents[x] is not None:
            recursively_translate(node.contents[x])

def in_section(soup, element, begin_marker='<!-- ARTICOL START -->', end_marker='<!-- ARTICOL FINAL -->'):
    """True if `element` sits between the two comment markers in the page source."""
    html_text = str(soup)
    return html_text.index(begin_marker) < html_text.index(str(element)) < html_text.index(end_marker)
amount = 1
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    print(filename)
    if filename == 'y_key_e479323ce281e459.html' or filename == 'directory.html':
        continue
    if filename.endswith(extension_file):
        with open(os.path.join(files_from_folder, filename), encoding='utf-8') as html:
            # Wrap the page in <pre> so html.parser preserves the original
            # whitespace; the wrapper is stripped again when the file is written.
            soup = BeautifulSoup('<pre>' + html.read() + '</pre>', 'html.parser')
        # Meta tags replacement
        meta_tags = soup.find_all('meta', {'http-equiv': 'Content-Language'}) + \
                    soup.find_all('meta', {'property': 'og:locale'}) + \
                    soup.find_all('script', {'type': 'application/ld+json'})
        for meta_tag in meta_tags:
            if meta_tag.has_attr('content'):
                # Language codes such as 'en' or 'en_US' become the destination code.
                meta_tag['content'] = re.sub(r'\b\w+\b', destination_language, meta_tag['content'])
            elif meta_tag.string:
                meta_tag.string = re.sub(r'https://neculaifantanaru.com/\b\w+\b/about.html', f'https://neculaifantanaru.com/{destination_language}/about.html', meta_tag.string)
        tags_to_update = soup.find_all('meta', {'http-equiv': 'Content-Language'}) + \
                         soup.find_all('meta', {'property': 'og:locale'}) + \
                         soup.find_all('meta', {'property': 'og:url'}) + \
                         soup.find_all('link', {'rel': 'canonical'}) + \
                         soup.find_all('html') + \
                         soup.find_all('script', {'type': 'application/ld+json'})
        for tag in tags_to_update:
            if tag.has_attr('content'):
                tag['content'] = re.sub(r'(?<=\.com/)\w+(?=/)', destination_language, tag['content'], count=1)
            if tag.has_attr('href'):
                tag['href'] = re.sub(r'(?<=\.com/)\w+(?=/)', destination_language, tag['href'], count=1)
            if tag.has_attr('lang'):
                tag['lang'] = destination_language
            if tag.string:
                tag.string = re.sub(r'(?<="url": "https://neculaifantanaru\.com/)\w+(?=/about\.html)', destination_language, tag.string, count=1)
        for title in soup.find_all('title'):
            recursively_translate(title)
        for meta in soup.find_all('meta', {'name': 'description'}):
            try:
                meta['content'] = translator.translate(meta['content'], dest=destination_language).text
            except Exception:
                pass  # keep the original description if the call fails
        # Translate the article body. Most elements only count when they sit
        # between the ARTICOL START / ARTICOL FINAL comment markers.
        for h1 in soup.find_all('h1', {'itemprop': 'name'}, class_='den_articol'):
            if in_section(soup, h1):
                recursively_translate(h1)
        for p in soup.find_all('p', class_='text_obisnuit'):
            if in_section(soup, p):
                recursively_translate(p)
        for p in soup.find_all('p', class_='text_obisnuit2'):
            recursively_translate(p)
        for p in soup.find_all('p', class_='NOU'):
            recursively_translate(p)
        for span in soup.find_all('span', class_='text_obisnuit2'):
            if in_section(soup, span):
                recursively_translate(span)
        for li in soup.find_all('li', class_='text_obisnuit'):
            if in_section(soup, li):
                recursively_translate(li)
        for a in soup.find_all('a', class_='linkMare'):
            if in_section(soup, a):
                recursively_translate(a)
        for h4 in soup.find_all('h4', class_='text_obisnuit2'):
            if in_section(soup, h4):
                recursively_translate(h4)
        for h5 in soup.find_all('h5', class_='text_obisnuit2'):
            if in_section(soup, h5):
                recursively_translate(h5)
        for h1 in soup.find_all('h1', {'itemprop': 'name'}, class_='den_webinar'):
            if in_section(soup, h1):
                recursively_translate(h1)
        for h3 in soup.find_all('h3', class_='font-weight-normal'):
            if in_section(soup, h3):
                recursively_translate(h3)
        # The same headings also appear in the blog list above the article.
        for h3 in soup.find_all('h3', class_='font-weight-normal'):
            if in_section(soup, h3, '<!-- Blog List Inner -->', '<!-- ARTICOL START -->'):
                recursively_translate(h3)
        for span in soup.find_all('span', class_='online'):
            if in_section(soup, span, '<!-- post -->', '<!-- ARTICOL START -->'):
                recursively_translate(span)
        for p in soup.find_all('p', class_='mb-40px'):
            if in_section(soup, p):
                recursively_translate(p)
        for p in soup.find_all('p', class_='mb-35px color-grey line-height-25px'):
            if in_section(soup, p):
                recursively_translate(p)
        print(f'{filename} translated ({amount})')
        amount += 1
        output = soup.encode(formatter=UnsortedAttributes()).decode('utf-8')
        new_filename = f'{filename.split(".")[0]}_{destination_language}.html'
        # output[5:-6] strips the <pre>...</pre> wrapper added at parse time.
        if use_translate_folder:
            os.makedirs(os.path.join(files_from_folder, 'translated'), exist_ok=True)
            with open(os.path.join(files_from_folder, 'translated', new_filename), 'w', encoding='utf-8') as new_html:
                new_html.write(output[5:-6])
        else:
            with open(os.path.join(files_from_folder, new_filename), 'w', encoding='utf-8') as html:
                html.write(output[5:-6])
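A minimal sketch of how to run it, assuming the standard PyPI package names (the gist itself pins no versions; the googletrans 4.0.0rc1 pre-release is a common choice because older releases break against the current Google endpoint, and translate_pages.py is a hypothetical name for this file):

    pip install beautifulsoup4 googletrans==4.0.0rc1
    python translate_pages.py

Set files_from_folder, destination_language, and use_translate_folder at the top of the script before running.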