Created
July 5, 2023 06:34
-
-
Save me-suzy/9bb3b344c38b4dab53c6db74c78f9695 to your computer and use it in GitHub Desktop.
ChatGPT - BUN FINAL 17 aprilie
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import regex

# Folder holding the HTML files to process
FOLDER_LOCAL = r'd:\Folder1\fisiere_html\de-convertit\1'

# Matches <meta content="..." name="description" /> (content attribute first)
# so it can be rewritten with the attributes in conventional order.
# The (?:(?!<meta).)*? guard keeps the match from spilling into the next meta tag.
meta_pattern = regex.compile(r'<meta content="((?:(?!<meta).)*?)"\sname="description"\s/>', regex.DOTALL)

modified_files = []  # names of files that actually had a description tag rewritten
counter = 1          # incremental id embedded in the injected canonical link

for filename in os.listdir(FOLDER_LOCAL):
    # Only touch HTML documents
    if filename.endswith('.html') or filename.endswith('.htm'):
        with open(os.path.join(FOLDER_LOCAL, filename), 'r', encoding='utf-8') as file:
            html = file.read()
        # Rewrite the meta tag: put name before content and strip embedded line breaks
        new_html = regex.sub(meta_pattern, lambda match: '<meta name="description" content="' + match.group(1).replace("\n", "").replace("\r", "") + '">', html)
        # Only rewrite files where a substitution actually happened
        if new_html != html:
            modified_files.append(filename)
            # Prepend a placeholder canonical link carrying the incremental id
            new_html = f'<link rel=canonical href="TESTAREA_{counter}">\n' + new_html
            counter += 1
            with open(os.path.join(FOLDER_LOCAL, filename), 'w', encoding='utf-8') as file:
                file.write(new_html)

# Report which files were modified
for filename in modified_files:
    # BUG FIX: the f-string had no placeholder, so the actual file name was never printed
    print(f'Modified file: {filename}')
import os | |
import regex | |
import re | |
import random | |
import unidecode | |
import nltk | |
from nltk import tokenize | |
# nltk.download('punkt') | |
import requests | |
import regex._regex as _regex | |
# Base URL prepended to every relative link read from the links file.
SITE = 'https://neculaifantanaru.com/'

# Stop-word list used when extracting keywords from a link slug: Romanian and
# English connectives plus digits and Roman numerals, repeated in lowercase,
# UPPERCASE and Capitalized variants (matching is case-sensitive).
LISTA_CUVINTE_LEGATURA = [
    'in', 'la', 'unei', 'si', 'sa', 'se', 'de', 'prin', 'unde', 'care', 'a',
    'al', 'prea', 'lui', 'din', 'ai', 'unui', 'acei', 'un', 'doar', 'tine',
    'ale', 'sau', 'dintre', 'intre', 'cu', 'ce', 'va', 'fi', 'este', 'cand', 'o',
    'cine', 'aceasta', 'ca', 'dar', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
    'to', 'was', 'your', 'you', 'is', 'are', 'iar', 'fara', 'asta', 'pe', 'tu',
    'nu', 'mai', 'ne', 'le', 'intr', 'cum', 'e', 'for', 'she', 'it', 'esti',
    'this', 'that', 'how', 'can', 't', 'must', 'be', 'the', 'and', 'do', 'so', 'or', 'ori',
    'who', 'what', 'if', 'of', 'on', 'i', 'we', 'they', 'them', 'but', 'where', 'by', 'an',
    'mi', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'made', 'my', 'me',
    'vom', 'voi', 'ei', 'cat', 'ar', 'putea', 'poti', 'sunteti', 'inca', 'still', 'noi', 'l',
    'ma', 's', 'dupa', 'after', 'under', 'sub', 'niste', 'some', 'those', 'he', 'no', 'too',
    'fac', 'made', 'make', 'cei', 'most', 'face', 'pentru', 'cat', 'cate', 'much', 'more', 'many',
    'sale', 'tale', 'tau', 'has', 'sunt', 'his', 'yours', 'only', 'as', 'toate', 'all', 'tot', 'incat',
    'which', 'ti', 'asa', 'like', 'these', 'because', 'unor', 'caci', 'ele', 'have', 'haven', 'te',
    'cea', 'else', 'imi', 'iti', 'should', 'could', 'not', 'even', 'chiar', 'when', 'ci', 'ne', 'ni',
    'her', 'our', 'alta', 'another', 'other', 'decat', 'acelasi', 'same', 'au', 'had', 'haven', 'hasn',
    'alte', 'alt', 'others', 'ceea', 'cel', 'cele', 'alte', 'despre', 'about', 'acele', 'acel', 'acea',
    'decit', 'with', '_', 'fata', 'towards', 'against', 'cind', 'dinspre', 'fost', 'been', 'era', 'daca',
    'eu', 'el', 'him', 'ea', 'will', 'am', 'cannot', 'between', 'cause', 'may', 'couldn',
    'IN', 'LA', 'UNEI', 'SI', 'SA', 'SE', 'DE', 'PRIN', 'UNDE', 'CARE', 'A',
    'AL', 'PREA', 'LUI', 'DIN', 'AI', 'UNUI', 'ACEI', 'UN', 'DOAR', 'TINE',
    'ALE', 'SAU', 'DINTRE', 'INTRE', 'CU', 'CE', 'VA', 'FI', 'ESTE', 'CAND', 'O',
    'CINE', 'ACEASTA', 'CA', 'DAR', 'TO', 'WAS', 'YOUR', 'YOU', 'IS', 'ARE', 'IAR', 'FARA', 'ASTA', 'PE', 'TU',
    'NU', 'MAI', 'NE', 'LE', 'INTR', 'CUM', 'E', 'FOR', 'SHE', 'IT', 'ESTI',
    'THIS', 'THAT', 'HOW', 'CAN', 'T', 'MUST', 'BE', 'THE', 'AND', 'DO', 'SO', 'OR', 'ORI',
    'WHO', 'WHAT', 'IF', 'OF', 'ON', 'I', 'WE', 'THEY', 'THEM', 'BUT', 'WHERE', 'BY', 'AN',
    'MI', 'MADE', 'MY', 'ME', 'VOM', 'VOI', 'EI', 'CAT', 'AR', 'PUTEA', 'POTI', 'SUNTETI', 'INCA', 'STILL', 'NOI', 'L',
    'MA', 'S', 'DUPA', 'AFTER', 'UNDER', 'SUB', 'NISTE', 'SOME', 'THOSE', 'HE', 'NO', 'TOO',
    'FAC', 'MADE', 'MAKE', 'CEI', 'MOST', 'FACE', 'PENTRU', 'CAT', 'CATE', 'MUCH', 'MORE', 'MANY',
    'SALE', 'TALE', 'TAU', 'HAS', 'SUNT', 'HIS', 'YOURS', 'ONLY', 'AS', 'TOATE', 'ALL', 'TOT', 'INCAT',
    'WHICH', 'TI', 'ASA', 'LIKE', 'THESE', 'BECAUSE', 'UNOR', 'CACI', 'ELE', 'HAVE', 'HAVEN', 'TE',
    'CEA', 'ELSE', 'IMI', 'ITI', 'SHOULD', 'COULD', 'NOT', 'EVEN', 'CHIAR', 'WHEN', 'CI', 'NE', 'NI',
    'HER', 'OUR', 'ALTA', 'ANOTHER', 'OTHER', 'DECAT', 'ACELASI', 'SAME', 'AU', 'HAD', 'HAVEN', 'HASN',
    'ALTE', 'ALT', 'OTHERS', 'CEEA', 'CEL', 'CELE', 'ALTE', 'DESPRE', 'ABOUT', 'ACELE', 'ACEL', 'ACEA',
    'DECIT', 'WITH', '_', 'FATA', 'TOWARDS', 'AGAINST', 'CIND', 'DINSPRE', 'FOST', 'BEEN', 'ERA', 'DACA',
    'EU', 'EL', 'HIM', 'EA', 'WILL', 'AM', 'CANNOT', 'BETWEEN', 'CAUSE', 'MAY', 'COULDN',
    'In', 'La', 'Unei', 'Si', 'Sa', 'Se', 'De', 'Prin', 'Unde', 'Care', 'Al', 'Prea', 'Lui', 'Din', 'Ai', 'Unui',
    'Acei', 'Un', 'Doar', 'Tine', 'Ale', 'Sau', 'Dintre', 'Intre', 'Cu', 'Ce', 'Va', 'Fi', 'Este', 'Cand', 'Cine', 'Aceasta', 'Ca',
    'Dar', 'Ii', 'Iii', 'Iv', 'V', 'Vi', 'Vii', 'Viii', 'To', 'Was', 'Your', 'You', 'Is', 'Are', 'Iar', 'Fara', 'Asta', 'Pe', 'Tu',
    'Nu', 'Mai', 'Ne', 'Le', 'Intr', 'Cum', 'For', 'She', 'It', 'Esti',
    'This', 'That', 'How', 'Can', 'Must', 'Be', 'The', 'And', 'Do', 'So', 'Or', 'Ori',
    'Who', 'What', 'If', 'Of', 'On', 'We', 'They', 'Them', 'But', 'Where', 'By', 'An',
    'Mi', 'Made', 'My', 'Me', 'Vom', 'Voi', 'Ei', 'Cat', 'Ar', 'Putea', 'Poti', 'Sunteti', 'Inca', 'Still', 'Noi',
    'Ma', 'Dupa', 'After', 'Under', 'Sub', 'Niste', 'Some', 'Those', 'He', 'No', 'Too',
    'Fac', 'Made', 'Make', 'Cei', 'Most', 'Face', 'Pentru', 'Cat', 'Cate', 'Much', 'More', 'Many',
    'Sale', 'Tale', 'Tau', 'Has', 'Sunt', 'His', 'Yours', 'Only', 'As', 'Toate', 'All', 'Tot', 'Incat',
    'Which', 'Ti', 'Asa', 'Like', 'These', 'Because', 'Unor', 'Caci', 'Ele', 'Have', 'Haven', 'Te',
    'Cea', 'Else', 'Imi', 'Iti', 'Should', 'Could', 'Not', 'Even', 'Chiar', 'When', 'Ci', 'Ne', 'Ni',
    'Her', 'Our', 'Alta', 'Another', 'Other', 'Decat', 'Acelasi', 'Same', 'Au', 'Had', 'Haven', 'Hasn',
    'Alte', 'Alt', 'Others', 'Ceea', 'Cel', 'Cele', 'Alte', 'Despre', 'About', 'Acele', 'Acel', 'Acea',
    'Decit', 'With', 'Fata', 'Towards', 'Against', 'Cind', 'Dinspre', 'Fost', 'Been', 'Era', 'Daca',
    'Eu', 'El', 'Him', 'Ea', 'Will', 'Am', 'Cannot', 'Between', 'Cause', 'May', 'Couldn', 'destul', 'enough',
    'Destul', 'Enough', 'from', 'FROM', 'From', 'ia', 'Ia', 'IA'
]

# HTML template for every injected link ({} -> URL, {} -> anchor text).
# PATTERN_LINK = "<a href=\"{}\" target=\"_new\">{}</a>"
PATTERN_LINK = "<a href=\"{}\" class=\"color-bebe\" target=\"_new\">{}</a>"

'''
structure of the words dictionary:
{
    "word1": [link_list1],
    "word2": [link_list2]
}
'''

# File with one relative link (slug) per line.
CALE_FISIER_LINKURI = "d:\\Folder1\\LINKS\\links.txt"

# 'def' is the keyword used to define a function in Python
# RULE: def function_name(argument_list)
def preia_cuvinte_link(link):
    """Extract the keywords of a link slug.

    Takes the part of *link* before the first '.', splits it on '-'
    and returns the pieces that are not stop words
    (i.e. not present in LISTA_CUVINTE_LEGATURA).
    """
    slug = link.split('.')[0]
    return [word for word in slug.split('-') if word not in LISTA_CUVINTE_LEGATURA]
def preia_cuvinte_lista_linkuri(cale_fisier_linkuri):
    """Build a {keyword: [full URLs]} map from a file of link slugs.

    Every line of *cale_fisier_linkuri* is a relative link; its keywords
    come from preia_cuvinte_link, and each keyword maps to the list of
    full URLs (SITE + link) that contain it, without duplicates.
    """
    dictionar_cuvinte_linkuri = dict()
    with open(cale_fisier_linkuri, encoding='utf8') as fp:
        for raw_line in fp.readlines():
            url = SITE + raw_line.strip()
            for word in preia_cuvinte_link(raw_line.strip()):
                bucket = dictionar_cuvinte_linkuri.setdefault(word, [])
                if url not in bucket:
                    bucket.append(url)
    return dictionar_cuvinte_linkuri
def citeste_fisier_linie_cu_linie(cale_fisier):
    """Print every line of *cale_fisier* prefixed with its 0-based index."""
    with open(cale_fisier, encoding='utf8') as fp:
        for index, line in enumerate(fp.readlines()):
            print(index, line.strip())
def read_text_from_file(file_path):
    """Return the whole contents of a file.

    file_path: path of the file to read; decoded as UTF-8 with
    undecodable bytes silently dropped (errors='ignore').
    """
    with open(file_path, encoding='utf8', errors='ignore') as handle:
        return handle.read()
def write_to_file(text, file_path):
    """Write a text to a file.

    text: the string to write; encoded as UTF-8, unencodable
    characters are silently dropped (errors='ignore').
    file_path: path of the destination file (overwritten).
    """
    encoded = text.encode('utf8', 'ignore')
    with open(file_path, 'wb') as handle:
        handle.write(encoded)
def introducere_linkuri(page, paragrafe):
    """Wrap paragraphs in <p class="mb-40px"> tags and inject one internal link.

    The first third of *paragrafe* is copied through untouched (existing
    anchors stripped to their inner text, newlines removed, then wrapped).
    From the second third onward, the first paragraph containing a keyword
    from the links file gets one randomly chosen link inserted over that
    keyword plus the following word (or the keyword alone if no word follows).

    page: file name, used only in diagnostic messages
    paragrafe: list of paragraph inner-texts (without their <p> tags)
    Returns the rebuilt article text as a single newline-joined string.
    """
    tag = "<p class=\"mb-40px\">{}</p>"
    text_start_final = ""
    LINK_INTRODUS = 0  # becomes 1 once a link has been inserted
    # start looking for link spots at the second third of the text
    start_paragraf = int(len(paragrafe) / 3)
    for paragraf in paragrafe[:start_paragraf]:
        # strip any existing <a> anchors, keeping their inner text (group 4)
        if len(re.findall(r'(<a )(.*?)(>)([\s\S]*?)(</a>)', paragraf)) != 0:
            paragraf = re.sub(r'(<a )(.*?)(>)([\s\S]*?)(</a>)', r'\4', paragraf)
        if len(re.findall(r'\n+', paragraf)) != 0:
            paragraf = re.sub(r'\n+', r'', paragraf)
        text_start_final = text_start_final + '\n' + tag.format(paragraf)
    # PERF: the keyword->links dictionary is loop-invariant, so load it at most
    # once (lazily) instead of re-reading the links file for every paragraph
    dictionar_linkuri = None
    for paragraf in paragrafe[start_paragraf:]:
        if len(re.findall(r'(<a )(.*?)(>)([\s\S]*?)(</a>)', paragraf)) != 0:
            paragraf = re.sub(r'(<a )(.*?)(>)([\s\S]*?)(</a>)', r'\4', paragraf)
        lista_cuvinte_gasite = list()
        if LINK_INTRODUS == 0:
            # collect all candidate words of the current paragraph
            cuvinte = re.findall(r' (?:\w|-*\!)+[ ,]', paragraf)
            if dictionar_linkuri is None:
                dictionar_linkuri = preia_cuvinte_lista_linkuri(CALE_FISIER_LINKURI)
            for cuv in cuvinte:
                cuv_fara_semne = cuv.replace(' ', '').replace(',', '')
                if cuv_fara_semne in dictionar_linkuri.keys():
                    lista_cuvinte_gasite.append(cuv)
            # words found both in the paragraph and in the dictionary (deduplicated)
            lista_cuvinte_gasite = list(set(lista_cuvinte_gasite))
            # if keywords were found, insert a link over one of them
            if len(lista_cuvinte_gasite) > 0:
                cuvant_random = random.sample(lista_cuvinte_gasite, 1)[0]
                cuvant_random_fara_semne = cuvant_random.replace(' ', '').replace(',', '')
                link_random = random.sample(dictionar_linkuri[cuvant_random_fara_semne], 1)[0]
                # link over two words: the keyword plus the word right after it
                expresie_regulata = cuvant_random.strip() + r' *\w+'
                urmatorul_cuvant = re.findall(expresie_regulata, paragraf)
                if len(urmatorul_cuvant) == 0:
                    # no following word found: link the keyword alone
                    print("Nu am gasit urmatorul cuvant pe pagina {}!!!".format(page))
                    pattern = PATTERN_LINK.format(link_random, cuvant_random.strip())
                    paragraf = paragraf.replace(cuvant_random.strip(), pattern, 1)
                    LINK_INTRODUS = 1
                else:
                    urmatorul_cuvant = urmatorul_cuvant[0]
                    pattern = PATTERN_LINK.format(link_random, urmatorul_cuvant)
                    paragraf = paragraf.replace(urmatorul_cuvant, pattern, 1)
                    LINK_INTRODUS = 1
        paragraf = tag.format(paragraf)
        # BUG FIX: the original assigned the newline-stripped text to the
        # leaked global paragraphs[i] instead of the local paragraf, so the
        # cleanup never reached the appended text and the function could
        # raise NameError when called on its own
        if len(re.findall(r'\n+', paragraf)) != 0:
            paragraf = re.sub(r'\n+', r'', paragraf)
        text_start_final = text_start_final + '\n' + paragraf
    if LINK_INTRODUS == 0:
        print("Nu am introdus niciun link-ul pe pagina: {}.".format(page))
    return text_start_final
# 1. Collect pages from a given folder (see variable FOLDER_LOCAL)
# FOLDER_LOCAL = 'd:/Folder1/fisiere_html/de-convertit/1'  # put the HTML files here -------------------------------
# AFTER RUNNING THE CODE, THE NEW FILES ARE SAVED IN d:\Folder1\fisiere_html_modificate\
# CHATGPT START
# Pre-compiled patterns: four alternative article layouts, tag strippers and
# the unquoted-canonical normaliser applied to every downloaded page.
page_text_pattern = re.compile('<div class="article-info clearfix">([\s\S]*?)</article>')
img_tag_pattern = re.compile('<img[\s\S]*?>')  # matches every <img> tag
html_tag_pattern = re.compile('<[^>]+>')  # pattern for stripping any HTML tag
link_replace_pattern = re.compile('<link rel=canonical href="(.*?)">')  # for replacing the <link rel=canonical href= > tag
script_pattern = re.compile('if\(typeof ez_ad_units!=\'undefined\'\){ez_ad_units.push\(\[\[728,90],\'drawandpaintforfun_com-box-3\',\'ezslot_6\',105,\'0\',\'0\'\]\]\);__ez_fad_position\(\'div-gpt-ad-drawandpaintforfun_com-box-3-0\'\);}')  # specific ad-insertion script to remove
page_text_pattern2 = re.compile('<div itemprop="description">([\s\S]*?)<h2 class="is-sr-only">Compartir</h2>')
page_text_pattern3 = re.compile('<div class="content">([\s\S]*?)<aside class="block widget widget--share">')
page_text_pattern4 = re.compile('<div class="article-post-content">([\s\S]*?)</article>')
paragraph_pattern = re.compile('<p.*?>([\s\S]*?)</p>')  # captures the inner text of each paragraph
counter_sterse = 0  # number of files deleted for having no description tags
INFORMATII_PAGINI = list()  # collected tuples: (page_title, text_description, canonical_tag, new_page_text)
for f in os.listdir(FOLDER_LOCAL):
    # only process HTML documents
    if f.endswith('.html') or f.endswith('.htm'):
        filepath = os.path.join(FOLDER_LOCAL, f)
        page_html = read_text_from_file(filepath)
        # Replace the <link rel=canonical href= > tag with <link rel="canonical" href=" " />
        page_html = re.sub(link_replace_pattern, '<link rel="canonical" href="\g<1>" />', page_html)
        # At this point, every <img> tag is replaced with a dot
        page_html = re.sub(img_tag_pattern, '.', page_html)
        # At this point, the specified ad script is replaced with a dot
        page_html = re.sub(script_pattern, '.', page_html)
        # Get the page text: the four layout patterns are tried in order
        page_text = re.findall(page_text_pattern, page_html)
        page_text2 = re.findall(page_text_pattern2, page_html)
        page_text3 = re.findall(page_text_pattern3, page_html)
        page_text4 = re.findall(page_text_pattern4, page_html)
        if len(page_text) != 0 or len(page_text2) != 0 or len(page_text3) != 0 or len(page_text4) != 0:
            if len(page_text) != 0:
                page_text = page_text[0]
            elif len(page_text2) != 0:
                page_text = page_text2[0]
            elif len(page_text3) != 0:
                page_text = page_text3[0]
            else:
                page_text = page_text4[0]
            # 'page_text' now holds the text of the selected div;
            # strip the remaining HTML tags (each replaced by a dot)
            # NOTE(review): page_text_no_html is computed but never used afterwards
            page_text_no_html = re.sub(html_tag_pattern, '.', page_text)
            # CHATGPT END
            # turn text of the form "text</p>" into "<p class="mb-40px">text</p>"
            page_text = re.sub(r'(^.*)(?!<p>)([\s\S]*?)(</p>)', r'<p class="mb-40px">\1\2\3', page_text)
            # extract the <p></p> paragraphs and build a new text
            paragraphs = re.findall(paragraph_pattern, page_text)
            new_paragraphs = list()
            if len(paragraphs) == 0:
                continue
            else:
                for i in range(len(paragraphs)):
                    # paragraphs using <br /> are split into proper <p> blocks
                    # before their inner texts are re-extracted
                    if '<br />' in paragraphs[i]:
                        if len(re.findall(r'^(.*?)(<br />)', paragraphs[i], flags=re.MULTILINE)) != 0:
                            paragraphs[i] = re.sub(r'^(.*?)(<br />)', r'\1</p>', paragraphs[i], flags=re.MULTILINE)
                        if len(re.findall(r'(^.*)(?!<p>)(.+?)(</p>)', paragraphs[i], flags=re.MULTILINE)) != 0:
                            paragraphs[i] = re.sub(r'(^.*)(?!<p>)(.+?)(</p>)', r'<p class="mb-40px">\1\2\3', paragraphs[i], flags=re.MULTILINE)
                        if len(re.findall(r'^(?!<p class="mb-40px">)(.+?)', paragraphs[i], flags=re.MULTILINE)) != 0:
                            paragraphs[i] = re.sub(r'^(?!<p class="mb-40px">)(.*)', r'<p class="mb-40px">\1</p>', paragraphs[i], flags=re.MULTILINE)
                        # NOTE(review): a <br />-paragraph that yields no <p> match
                        # after the rewrites above is dropped entirely - confirm intended
                        if len(re.findall(paragraph_pattern, paragraphs[i])) != 0:
                            for p in re.findall(paragraph_pattern, paragraphs[i]):
                                new_paragraphs.append(p)
                    else:
                        new_paragraphs.append(paragraphs[i])
            # insert the internal links
            new_page_text = introducere_linkuri(f, new_paragraphs)
            title_pattern = re.compile('<title>([\s\S]*?)</title>')
            title_og_pattern = re.compile('<meta property="og:title" content=(.*?) />')
            if len(re.findall(title_pattern, page_html)) != 0 or len(re.findall(title_og_pattern, page_html)) != 0:
                if len(re.findall(title_pattern, page_html)) != 0:
                    page_title = re.findall(title_pattern, page_html)
                else:
                    page_title = re.findall(title_og_pattern, page_html)
                page_title = page_title[0]
                # normalise the title words: ALL-CAPS words become Capitalized
                title_words = page_title.split(' ')
                new_title_words = list()
                for w in title_words:
                    if w.isupper():
                        new_title_words.append(w.lower().capitalize())
                    else:
                        new_title_words.append(w)
                page_title = " ".join(new_title_words)
                # description
                description_pattern = re.compile('<meta name="description" content="([\s\S]*?)>')
                description_og_pattern = re.compile('<meta property="og:description" content="([\s\S]*?)>')
                text_description = 'MANCARE'  # placeholder, overwritten by one of the cases below
                if len(re.findall(description_pattern, page_html)) == 0 and len(re.findall(description_og_pattern, page_html)) == 0:
                    # neither description tag exists: delete the file
                    print("AM STERS FIINDCA NU ARE TAGURILE DESCRIPTION")
                    # print("Filepath:", filepath)  # prints the path of the file about to be deleted
                    # print("Page HTML:", page_html)  # prints the page's HTML content
                    print("Description pattern results:", re.findall(description_pattern, page_html))  # results for the first pattern
                    print("Description OG pattern results:", re.findall(description_og_pattern, page_html))  # results for the second pattern
                    os.remove(filepath)
                    counter_sterse += 1
                    continue
                elif len(re.findall(description_pattern, page_html)) != 0 and len(re.findall(description_og_pattern, page_html)) == 0:
                    # only the plain description exists: duplicate it as og:description
                    print("CAZ 1")
                    text_description = re.findall(description_pattern, page_html)
                    text_description = text_description[0]
                    description_model = '<meta name="description" content="{}">'.format(text_description)
                    og_description_model = '<meta property="og:description" content="{}"/>'.format(text_description)
                    page_html = re.sub(r'(<meta name="description" content="[\s\S]*?" />)', description_model + '\n' + og_description_model, page_html)
                elif len(re.findall(description_pattern, page_html)) == 0 and len(re.findall(description_og_pattern, page_html)) != 0:
                    # only og:description exists: duplicate it as a plain description
                    print("CAZ II")
                    text_description = re.findall(description_og_pattern, page_html)
                    text_description = text_description[0]
                    description_model = '<meta name="description" content="{}">'.format(text_description)
                    og_description_model = '<meta property="og:description" content="{}"/>'.format(text_description)
                    page_html = re.sub(r'(<meta property="og:description" content=".*?" />)', description_model + '\n' + og_description_model, page_html)
                else:
                    # both exist: just read the plain description
                    print("CAZ III")
                    text_description = re.findall(description_pattern, page_html)
                    text_description = text_description[0]
                print("DESCCC: ", text_description)
                print(page_title)
                print(f)
                # canonical
                canonical_tag_pattern = re.compile('<link rel="canonical" href="(.*?)" />')
                canonical_og_tag_pattern = re.compile('<meta property="og:url" content="(.*?)" />')
                if len(re.findall(canonical_tag_pattern, page_html)) != 0 or len(re.findall(canonical_og_tag_pattern, page_html)) != 0:
                    if len(re.findall(canonical_tag_pattern, page_html)) != 0:
                        canonical_tag = re.findall(canonical_tag_pattern, page_html)
                    else:
                        canonical_tag = re.findall(canonical_og_tag_pattern, page_html)
                    canonical_tag = canonical_tag[0]
                    # print("canonical: ", canonical_tag)
                    # append the final source note
                    link_pattern = '<a href={} class="color-bebe" target="_new">{}</a>'.format(canonical_tag, canonical_tag)
                    new_page_text = new_page_text + '\n' + '<p class="mb-40px"><strong>* Sursă:</strong> {} </p>'.format(link_pattern) + '\n'
                    # store the collected information
                    informatie = (page_title, text_description, canonical_tag, new_page_text)
                    INFORMATII_PAGINI.append(informatie)
                else:
                    print("Pagina structura gresita - canonical: ", f)
                    continue
            else:
                print("Pagina structura gresita - title: ", f)
                continue
        else:
            print("Pagina structura gresita - text: ", f)
            continue
# each page tuple has the structure: (page_title, page_description, canonical_tag, new_page_text)
def copiaza_continut_txt_html(page, cale_fisier_html):  # these are the function's arguments, i.e. what I pass when calling it
    """Pour one collected page into the template HTML file and save the result.

    page: tuple (title, description, canonical URL, article body)
    cale_fisier_html: path to the template HTML file
    """
    text_html = read_text_from_file(cale_fisier_html)
    # regex pattern: ([\s\S]*?) captures everything between the two markers;
    # adjust the regular expression according to the tags used in the template
    articol_pattern = re.compile('<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->[\s\S]*?')
    text_articol = re.findall(articol_pattern, text_html)
    if len(text_articol) != 0:
        text_articol = text_articol[0]
        text_html = text_html.replace(text_articol, page[3])  # index 3 holds new_page_text
    else:
        print("Fisier html fara ARTICOL START/FINAL.")
    title_pattern = re.compile('<title>([\s\S]*?)</title>')
    text_title = re.findall(title_pattern, text_html)
    # 01.02.2022: replace h3 with the title text (2) - THIS IS WHERE YOU CHANGE THE ARTICLE TITLE TAG IN THE PAGE
    h3_pattern = re.compile('<h3 class=\"font-weight-normal\"><a href=\"javascript:void\(0\)\" class=\"color-black\">(.*?)</a></h3>')
    text_h3 = re.findall(h3_pattern, text_html)
    if len(text_title) != 0:
        text_title = text_title[0]
        canonical_words = ''
        # derive the output file name slug from the canonical URL
        if page[2].endswith('/'):
            canonical_words = page[2].split('/')[-2]
        else:
            print("PAGE: ", page[2])
            canonical_words = page[2].split('/')[-1]  # if it does not end with / or .html, take the last chunk after /
            if '.' in canonical_words:  # in case it ends with .html / .htm
                canonical_words = canonical_words.split('.')[0]
        # build the new file name
        new_file_name_fara_spatiu = canonical_words + '.html'
        # replace the template title with the page title
        text_html = text_html.replace(text_title, page[0])  # page[0] is the title
        # 01.02.2022: replace h3 with the title text (2)
        if len(text_h3) != 0:
            text_h3 = text_h3[0]
            text_html = text_html.replace(text_h3, page[0])
        else:
            print("Fisierul nu are tag-ul h3.")
        # 07.02.2022: replace the canonical tag text
        canonical_tag_pattern = re.compile('<link rel="canonical" href="(.*?)" />')
        canonical_tag = re.findall(canonical_tag_pattern, text_html)
        if len(canonical_tag) != 0:
            canonical_tag = canonical_tag[0]
            #text_html = text_html.replace(canonical_tag, new_file_name_fara_spatiu)
            # if "https://neculaifantanaru.com/" must go in front, comment the line above and uncomment the one below
            text_html = text_html.replace(canonical_tag, "https://trinketbox.ro/" + new_file_name_fara_spatiu)
        else:
            print("Fisier fara tag canonical")
    else:
        print("Fisier html fara titlu.")
    description_pattern = re.compile('<meta name="description" content="(.*?)">')
    text_description = re.findall(description_pattern, text_html)
    if len(text_description) != 0:
        text_description = text_description[0]
        # print("text description: ", text_description)
        text_html = text_html.replace(text_description, page[1])  # description sits at index 1
    else:
        print("Fisier html fara description.")
    # NOTE(review): if the template has no <title>, new_file_name_fara_spatiu is
    # never assigned and the next line raises NameError - confirm every template
    # contains a <title> tag
    file_path = os.path.dirname(cale_fisier_html) + "\\" + "fisiere_html_modificate" + "\\" + new_file_name_fara_spatiu  # the generated files go into this folder
    write_to_file(text_html, file_path)
    # print("Fisier: ", new_file_name_fara_spatiu)
    print("Scriere efectuata cu succes.")
def creare_fisiere_html(cale_fisier_html):
    """Generate one HTML file per collected page.

    Iterates over the globally collected INFORMATII_PAGINI tuples,
    pours each one into the template at *cale_fisier_html* and
    reports how many files were produced.
    """
    processed = 0
    for pagina in INFORMATII_PAGINI:
        copiaza_continut_txt_html(pagina, cale_fisier_html)
        processed += 1
    print("Numarul de fisiere modificate: ", processed)
def main():
    # Entry point: fill the trinketbox template with every collected page.
    creare_fisiere_html("d:\\Folder1\\index_trinketbox.html")  # this is the model index page from trinketbox.ro
    # AFTER RUNNING THE CODE, THE NEW FILES ARE SAVED IN d:\Folder1\fisiere_html_modificate\
    # dictionar_cuvinte = preia_cuvinte_lista_linkuri(CALE_FISIER_LINKURI)
    # print(dictionar_cuvinte)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment