Skip to content

Instantly share code, notes, and snippets.

@me-suzy
Created July 4, 2023 16:31
Show Gist options
  • Save me-suzy/058e9f300a35a9cc8daa0794da8c42f0 to your computer and use it in GitHub Desktop.
Save me-suzy/058e9f300a35a9cc8daa0794da8c42f0 to your computer and use it in GitHub Desktop.
tesr
#-------------------------------------------------------------------------------
# Name: module1
# Purpose:
#
# Author: Castel
#
# Created: 22/01/2022
# Copyright: (c) Castel 2022
# Licence: <your licence>
#-------------------------------------------------------------------------------
import os
import regex
import re
import random
import unidecode
import nltk
from nltk import tokenize
# nltk.download('punkt')
import requests
import regex._regex as _regex
# Base URL that is prepended to every link file name read from the links file.
SITE = 'https://neculaifantanaru.com/'
# Connector / stop words (Romanian and English, plus Roman numerals and digits),
# listed in lower-case, upper-case and capitalized variants. Pieces of a link
# file name that appear in this list are not kept as keywords.
LISTA_CUVINTE_LEGATURA = [
'in', 'la', 'unei', 'si', 'sa', 'se', 'de', 'prin', 'unde', 'care', 'a',
'al', 'prea', 'lui', 'din', 'ai', 'unui', 'acei', 'un', 'doar', 'tine',
'ale', 'sau', 'dintre', 'intre', 'cu', 'ce', 'va', 'fi', 'este', 'cand', 'o',
'cine', 'aceasta', 'ca', 'dar', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
'to', 'was', 'your', 'you', 'is', 'are', 'iar', 'fara', 'asta', 'pe', 'tu',
'nu', 'mai', 'ne', 'le', 'intr', 'cum', 'e', 'for', 'she', 'it', 'esti',
'this', 'that', 'how', 'can', 't', 'must', 'be', 'the', 'and', 'do', 'so', 'or', 'ori',
'who', 'what', 'if', 'of', 'on', 'i', 'we', 'they', 'them', 'but', 'where', 'by', 'an',
'mi', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', 'made', 'my', 'me',
'vom', 'voi', 'ei', 'cat', 'ar', 'putea', 'poti', 'sunteti', 'inca', 'still', 'noi', 'l',
'ma', 's', 'dupa', 'after', 'under', 'sub', 'niste', 'some', 'those', 'he', 'no', 'too',
'fac', 'made', 'make', 'cei', 'most', 'face', 'pentru', 'cat', 'cate', 'much', 'more', 'many',
'sale', 'tale', 'tau', 'has', 'sunt', 'his', 'yours', 'only', 'as', 'toate', 'all', 'tot', 'incat',
'which', 'ti', 'asa', 'like', 'these', 'because', 'unor', 'caci', 'ele', 'have', 'haven', 'te',
'cea', 'else', 'imi', 'iti', 'should', 'could', 'not', 'even', 'chiar', 'when', 'ci', 'ne', 'ni',
'her', 'our', 'alta', 'another', 'other', 'decat', 'acelasi', 'same', 'au', 'had', 'haven', 'hasn',
'alte', 'alt', 'others', 'ceea', 'cel', 'cele', 'alte', 'despre', 'about', 'acele', 'acel', 'acea',
'decit', 'with', '_', 'fata', 'towards', 'against', 'cind', 'dinspre', 'fost', 'been', 'era', 'daca',
'eu', 'el', 'him', 'ea', 'will', 'am', 'cannot', 'between', 'cause', 'may', 'couldn',
'IN', 'LA', 'UNEI', 'SI', 'SA', 'SE', 'DE', 'PRIN', 'UNDE', 'CARE', 'A',
'AL', 'PREA', 'LUI', 'DIN', 'AI', 'UNUI', 'ACEI', 'UN', 'DOAR', 'TINE',
'ALE', 'SAU', 'DINTRE', 'INTRE', 'CU', 'CE', 'VA', 'FI', 'ESTE', 'CAND', 'O',
'CINE', 'ACEASTA', 'CA', 'DAR', 'TO', 'WAS', 'YOUR', 'YOU', 'IS', 'ARE', 'IAR', 'FARA', 'ASTA', 'PE', 'TU',
'NU', 'MAI', 'NE', 'LE', 'INTR', 'CUM', 'E', 'FOR', 'SHE', 'IT', 'ESTI',
'THIS', 'THAT', 'HOW', 'CAN', 'T', 'MUST', 'BE', 'THE', 'AND', 'DO', 'SO', 'OR', 'ORI',
'WHO', 'WHAT', 'IF', 'OF', 'ON', 'I', 'WE', 'THEY', 'THEM', 'BUT', 'WHERE', 'BY', 'AN',
'MI', 'MADE', 'MY', 'ME', 'VOM', 'VOI', 'EI', 'CAT', 'AR', 'PUTEA', 'POTI', 'SUNTETI', 'INCA', 'STILL', 'NOI', 'L',
'MA', 'S', 'DUPA', 'AFTER', 'UNDER', 'SUB', 'NISTE', 'SOME', 'THOSE', 'HE', 'NO', 'TOO',
'FAC', 'MADE', 'MAKE', 'CEI', 'MOST', 'FACE', 'PENTRU', 'CAT', 'CATE', 'MUCH', 'MORE', 'MANY',
'SALE', 'TALE', 'TAU', 'HAS', 'SUNT', 'HIS', 'YOURS', 'ONLY', 'AS', 'TOATE', 'ALL', 'TOT', 'INCAT',
'WHICH', 'TI', 'ASA', 'LIKE', 'THESE', 'BECAUSE', 'UNOR', 'CACI', 'ELE', 'HAVE', 'HAVEN', 'TE',
'CEA', 'ELSE', 'IMI', 'ITI', 'SHOULD', 'COULD', 'NOT', 'EVEN', 'CHIAR', 'WHEN', 'CI', 'NE', 'NI',
'HER', 'OUR', 'ALTA', 'ANOTHER', 'OTHER', 'DECAT', 'ACELASI', 'SAME', 'AU', 'HAD', 'HAVEN', 'HASN',
'ALTE', 'ALT', 'OTHERS', 'CEEA', 'CEL', 'CELE', 'ALTE', 'DESPRE', 'ABOUT', 'ACELE', 'ACEL', 'ACEA',
'DECIT', 'WITH', '_', 'FATA', 'TOWARDS', 'AGAINST', 'CIND', 'DINSPRE', 'FOST', 'BEEN', 'ERA', 'DACA',
'EU', 'EL', 'HIM', 'EA', 'WILL', 'AM', 'CANNOT', 'BETWEEN', 'CAUSE', 'MAY', 'COULDN',
'In', 'La', 'Unei', 'Si', 'Sa', 'Se', 'De', 'Prin', 'Unde', 'Care', 'Al', 'Prea', 'Lui', 'Din', 'Ai', 'Unui',
'Acei', 'Un', 'Doar', 'Tine', 'Ale', 'Sau', 'Dintre', 'Intre', 'Cu', 'Ce', 'Va', 'Fi', 'Este', 'Cand', 'Cine', 'Aceasta', 'Ca',
'Dar', 'Ii', 'Iii', 'Iv', 'V', 'Vi', 'Vii', 'Viii', 'To', 'Was', 'Your', 'You', 'Is', 'Are', 'Iar', 'Fara', 'Asta', 'Pe', 'Tu',
'Nu', 'Mai', 'Ne', 'Le', 'Intr', 'Cum', 'For', 'She', 'It', 'Esti',
'This', 'That', 'How', 'Can', 'Must', 'Be', 'The', 'And', 'Do', 'So', 'Or', 'Ori',
'Who', 'What', 'If', 'Of', 'On', 'We', 'They', 'Them', 'But', 'Where', 'By', 'An',
'Mi', 'Made', 'My', 'Me', 'Vom', 'Voi', 'Ei', 'Cat', 'Ar', 'Putea', 'Poti', 'Sunteti', 'Inca', 'Still', 'Noi',
'Ma', 'Dupa', 'After', 'Under', 'Sub', 'Niste', 'Some', 'Those', 'He', 'No', 'Too',
'Fac', 'Made', 'Make', 'Cei', 'Most', 'Face', 'Pentru', 'Cat', 'Cate', 'Much', 'More', 'Many',
'Sale', 'Tale', 'Tau', 'Has', 'Sunt', 'His', 'Yours', 'Only', 'As', 'Toate', 'All', 'Tot', 'Incat',
'Which', 'Ti', 'Asa', 'Like', 'These', 'Because', 'Unor', 'Caci', 'Ele', 'Have', 'Haven', 'Te',
'Cea', 'Else', 'Imi', 'Iti', 'Should', 'Could', 'Not', 'Even', 'Chiar', 'When', 'Ci', 'Ne', 'Ni',
'Her', 'Our', 'Alta', 'Another', 'Other', 'Decat', 'Acelasi', 'Same', 'Au', 'Had', 'Haven', 'Hasn',
'Alte', 'Alt', 'Others', 'Ceea', 'Cel', 'Cele', 'Alte', 'Despre', 'About', 'Acele', 'Acel', 'Acea',
'Decit', 'With', 'Fata', 'Towards', 'Against', 'Cind', 'Dinspre', 'Fost', 'Been', 'Era', 'Daca',
'Eu', 'El', 'Him', 'Ea', 'Will', 'Am', 'Cannot', 'Between', 'Cause', 'May', 'Couldn', 'destul', 'enough',
'Destul', 'Enough', 'from', 'FROM', 'From', 'ia', 'Ia', 'IA'
]
# Earlier link template without the CSS class, kept for reference:
#PATTERN_LINK = "<a href=\"{}\" target=\"_new\">{}</a>"
# HTML template for an inserted link: the first placeholder receives the URL,
# the second one the anchor text.
PATTERN_LINK = "<a href=\"{}\" class=\"color-bebe\" target=\"_new\">{}</a>"
# The bare string below is a no-op note (in Romanian) sketching the shape of
# the word dictionary: {"word1": [list_of_links1], "word2": [list_of_links2]}.
'''
structura dictionar cuvinte
{
"cuvantul1": [lista_linkuri1],
"cuvantul2": [lista_linkuri2]
}
'''
# Path of the text file that is read line by line, one link file name per line.
CALE_FISIER_LINKURI = "d:\\Folder1\\LINKS\\links.txt"
# We use DEF when we want to define a function => it is a keyword in Python
# RULE: def function_name(argument_list)
def preia_cuvinte_link(link):
    """Return the keywords contained in a link's file name.

    The part of ``link`` before the first '.' is split on '-' and every
    piece found in LISTA_CUVINTE_LEGATURA is left out. The remaining pieces
    are returned as a list, in their original order.
    """
    # partition keeps everything before the first dot (the whole string when
    # there is no dot at all)
    nume_fara_extensie, _, _ = link.partition('.')
    return [
        bucata
        for bucata in nume_fara_extensie.split('-')
        if bucata not in LISTA_CUVINTE_LEGATURA
    ]
def preia_cuvinte_lista_linkuri(cale_fisier_linkuri):
    """Build the keyword -> list-of-URLs dictionary from the links file.

    cale_fisier_linkuri: path of a UTF-8 text file with one link per line.

    Every line is stripped, its keywords are extracted with
    preia_cuvinte_link, and SITE + line is registered (once) under each
    keyword. Returns the resulting dictionary.
    """
    cuvinte_catre_linkuri = {}
    with open(cale_fisier_linkuri, encoding='utf8') as fisier:
        for rand in fisier.readlines():
            nume_link = rand.strip()
            adresa = SITE + nume_link
            for cuvant in preia_cuvinte_link(nume_link):
                adrese = cuvinte_catre_linkuri.setdefault(cuvant, [])
                # the same URL is never listed twice under one keyword
                if adresa not in adrese:
                    adrese.append(adresa)
    return cuvinte_catre_linkuri
def citeste_fisier_linie_cu_linie(cale_fisier):
    """Print every line of a UTF-8 text file, prefixed with its 0-based index.

    cale_fisier: path of the file to print. Each line is stripped of
    surrounding whitespace before it is printed.
    """
    with open(cale_fisier, encoding='utf8') as fisier:
        randuri = fisier.readlines()
    for numar, rand in enumerate(randuri):
        print(numar, rand.strip())
def read_text_from_file(file_path):
    """
    Return the whole content of a text file.
    file_path: path of the file to read; it is decoded as UTF-8 and bytes
    that cannot be decoded are silently dropped
    """
    with open(file_path, encoding='utf8', errors='ignore') as sursa:
        return sursa.read()
def write_to_file(text, file_path):
    """
    Write a text to a file, replacing any previous content.
    text: the text to write; it is encoded as UTF-8 and characters that
    cannot be encoded are silently dropped
    file_path: path of the file to write to
    """
    with open(file_path, 'wb') as destinatie:
        continut = text.encode('utf8', 'ignore')
        destinatie.write(continut)
# Matches an <a ...>...</a> element; group 4 is the text between the tags.
_ANCORA_PATTERN = re.compile(r'(<a )(.*?)(>)([\s\S]*?)(</a>)')
# Candidate words: preceded by a space and followed by a space or a comma.
_CUVANT_CANDIDAT_PATTERN = re.compile(r' (?:\w|-*\!)+[ ,]')


def _leaga_cuvant(paragraf, cuvant, link, page):
    """Wrap the first whole-word occurrence of ``cuvant`` in a link.

    The word that follows ``cuvant`` is included in the anchor text when
    there is one; otherwise only ``cuvant`` itself is linked.

    Returns a tuple (paragraph, inserted): the possibly modified paragraph
    and whether a link was actually inserted.
    """
    # The lookbehind keeps the match from starting inside a longer word and
    # re.escape keeps the word from being interpreted as regex syntax.
    inceput = r'(?<!\w)' + re.escape(cuvant)
    potrivire = re.search(inceput + r' *\w+', paragraf)
    if potrivire is None:
        print("Nu am gasit urmatorul cuvant pe pagina {}!!!".format(page))
        potrivire = re.search(inceput, paragraf)
        if potrivire is None:
            return paragraf, False
    ancora = PATTERN_LINK.format(link, potrivire.group(0))
    # Splice at the matched position so exactly the matched text is linked.
    paragraf = paragraf[:potrivire.start()] + ancora + paragraf[potrivire.end():]
    return paragraf, True


def introducere_linkuri(page, paragrafe):
    """Rebuild the article text and insert one keyword link into it.

    page: name of the page being processed (only used in messages).
    paragrafe: list of paragraph texts (inner content of the <p> tags).

    Every paragraph has its existing <a> tags removed (their text is kept),
    its newlines removed, and is wrapped in <p class="mb-40px">. The first
    third of the paragraphs is never linked. In the remaining paragraphs, the
    first one containing a word that is a key of the links dictionary gets a
    single link, on a randomly chosen such word, pointing to a randomly
    chosen URL registered for it.

    Returns the wrapped paragraphs, each one preceded by a newline.
    """
    tag = "<p class=\"mb-40px\">{}</p>"
    # the link is only inserted after the first third of the text
    start_paragraf = len(paragrafe) // 3
    bucati = []
    link_introdus = False
    # loaded lazily, at most once, and only if a paragraph has candidate words
    dictionar_linkuri = None

    for paragraf in paragrafe[:start_paragraf]:
        paragraf = _ANCORA_PATTERN.sub(r'\4', paragraf)
        bucati.append(tag.format(paragraf.replace('\n', '')))

    for paragraf in paragrafe[start_paragraf:]:
        paragraf = _ANCORA_PATTERN.sub(r'\4', paragraf)
        if not link_introdus:
            # all the candidate words of the current paragraph
            cuvinte = _CUVANT_CANDIDAT_PATTERN.findall(paragraf)
            if cuvinte:
                if dictionar_linkuri is None:
                    dictionar_linkuri = preia_cuvinte_lista_linkuri(CALE_FISIER_LINKURI)
                # distinct words of the paragraph that are also dictionary keys
                cuvinte_gasite = list({
                    cuv for cuv in cuvinte
                    if cuv.replace(' ', '').replace(',', '') in dictionar_linkuri
                })
                if cuvinte_gasite:
                    cuvant_random = random.choice(cuvinte_gasite)
                    cheie = cuvant_random.replace(' ', '').replace(',', '')
                    link_random = random.choice(dictionar_linkuri[cheie])
                    paragraf, link_introdus = _leaga_cuvant(
                        paragraf, cuvant_random.strip(), link_random, page
                    )
        # newlines are stripped from the paragraph itself (the original code
        # assigned the stripped text to an unrelated global list instead)
        bucati.append(tag.format(paragraf).replace('\n', ''))

    if not link_introdus:
        print("Nu am introdus niciun link-ul pe pagina: {}.".format(page))
    return ''.join('\n' + bucata for bucata in bucati)
# 1. Read the pages to convert from a local folder (see FOLDER_LOCAL)
FOLDER_LOCAL = 'd:/Folder1/fisiere_html/de-convertit/1'  # put the HTML files to convert here
# AFTER THE CODE RUNS, THE NEW FILES ARE SAVED IN d:\Folder1\fisiere_html_modificate\
# BEGIN ChatGPT-generated section
# All patterns are raw strings so that regex escapes such as \s or \( are not
# treated as (invalid) Python string escapes.
# First known page layout: the text between the START and END markers
page_text_pattern = re.compile(r'<div class="article-info clearfix">([\s\S]*?)</article>')
# Matches every <img> tag
img_tag_pattern = re.compile(r'<img[\s\S]*?>')
# Matches any HTML tag (used to strip the markup)
html_tag_pattern = re.compile(r'<[^>]+>')
# Matches the unquoted <link rel=canonical href= > tag; group 1 is the URL
link_replace_pattern = re.compile(r'<link rel=canonical href=(.*?)>')
# Matches one specific inline ad script
script_pattern = re.compile(r"if\(typeof ez_ad_units!='undefined'\){ez_ad_units.push\(\[\[728,90],'drawandpaintforfun_com-box-3','ezslot_6',105,'0','0'\]\]\);__ez_fad_position\('div-gpt-ad-drawandpaintforfun_com-box-3-0'\);}")
# Alternative page layouts, tried when the first one does not match
page_text_pattern2 = re.compile(r'<div itemprop="description">([\s\S]*?)<h2 class="is-sr-only">Compartir</h2>')
page_text_pattern3 = re.compile(r'<div class="content">([\s\S]*?)<aside class="block widget widget--share">')
page_text_pattern4 = re.compile(r'<div class="article-post-content">([\s\S]*?)</article>')
# Matches a <p ...>...</p> element; group 1 is the paragraph content
paragraph_pattern = re.compile(r'<p.*?>([\s\S]*?)</p>')
# Number of source files deleted because they have no description
counter_sterse = 0
# One (page_title, text_description, canonical_tag, new_page_text) tuple per converted page
INFORMATII_PAGINI = []
# Main conversion loop: for every .html/.htm file in FOLDER_LOCAL, extract the
# article text, rebuild its paragraphs, insert a link, collect title,
# description and canonical URL, and append the result to INFORMATII_PAGINI.
# NOTE(review): the indentation of this section was lost in this copy of the
# file; the nesting of the statements below must be restored before running.
for f in os.listdir(FOLDER_LOCAL):
if f.endswith('.html') or f.endswith('.htm'):
filepath = os.path.join(FOLDER_LOCAL, f)
page_html = read_text_from_file(filepath)
# Rewrite the unquoted <link rel=canonical href= > tag as <link rel="canonical" href=" " />
page_html = re.sub(link_replace_pattern, '<link rel="canonical" href="\g<1>" />', page_html)
# Every <img> tag is replaced with a dot
page_html = re.sub(img_tag_pattern, '.', page_html)
# The specified ad script is replaced with a dot
page_html = re.sub(script_pattern, '.', page_html)
# Extract the page text, trying each of the four known page layouts
page_text = re.findall(page_text_pattern, page_html)
page_text2 = re.findall(page_text_pattern2, page_html)
page_text3 = re.findall(page_text_pattern3, page_html)
page_text4 = re.findall(page_text_pattern4, page_html)
if len(page_text) != 0 or len(page_text2) != 0 or len(page_text3) != 0 or len(page_text4) != 0:
# keep the first match of the first layout that matched
if len(page_text) != 0:
page_text = page_text[0]
elif len(page_text2) != 0:
page_text = page_text2[0]
elif len(page_text3) != 0:
page_text = page_text3[0]
else:
page_text = page_text4[0]
# At this point 'page_text' is the text of the selected div
# Replace every HTML tag with a dot
# (page_text_no_html is not read again anywhere in this section)
page_text_no_html = re.sub(html_tag_pattern, '.', page_text)
# END ChatGPT-generated section
# turn text of the form "text</p>" into "<p class="mb-40px">text</p>"
page_text = re.sub(r'(^.*)(?!<p>)([\s\S]*?)(</p>)', r'<p class="mb-40px">\1\2\3', page_text)
# extract the <p></p> paragraphs and build a new text
paragraphs = re.findall(paragraph_pattern, page_text)
new_paragraphs = list()
if len(paragraphs) == 0:
continue
else:
for i in range(len(paragraphs)):
# paragraphs that contain <br /> are split into separate <p> paragraphs
if '<br />' in paragraphs[i]:
if len(re.findall(r'^(.*?)(<br />)', paragraphs[i], flags=re.MULTILINE)) != 0:
paragraphs[i] = re.sub(r'^(.*?)(<br />)', r'\1</p>', paragraphs[i], flags=re.MULTILINE)
if len(re.findall(r'(^.*)(?!<p>)(.+?)(</p>)', paragraphs[i], flags=re.MULTILINE)) != 0:
paragraphs[i] = re.sub(r'(^.*)(?!<p>)(.+?)(</p>)', r'<p class="mb-40px">\1\2\3', paragraphs[i], flags=re.MULTILINE)
if len(re.findall(r'^(?!<p class="mb-40px">)(.+?)', paragraphs[i], flags=re.MULTILINE)) != 0:
paragraphs[i] = re.sub(r'^(?!<p class="mb-40px">)(.*)', r'<p class="mb-40px">\1</p>', paragraphs[i], flags=re.MULTILINE)
if len(re.findall(paragraph_pattern, paragraphs[i])) != 0:
for p in re.findall(paragraph_pattern, paragraphs[i]):
new_paragraphs.append(p)
else:
new_paragraphs.append(paragraphs[i])
# insert the links
new_page_text = introducere_linkuri(f, new_paragraphs)
# the title is taken from <title>, or from og:title when there is no <title>
title_pattern = re.compile('<title>([\s\S]*?)</title>')
title_og_pattern = re.compile('<meta property="og:title" content=(.*?) />')
if len(re.findall(title_pattern, page_html)) != 0 or len(re.findall(title_og_pattern, page_html)) != 0:
if len(re.findall(title_pattern, page_html)) != 0:
page_title = re.findall(title_pattern, page_html)
else:
page_title = re.findall(title_og_pattern, page_html)
page_title = page_title[0]
# adjust the title words: fully upper-case words become capitalized
title_words = page_title.split(' ')
new_title_words = list()
for w in title_words:
if w.isupper():
new_title_words.append(w.lower().capitalize())
else:
new_title_words.append(w)
page_title = " ".join(new_title_words)
# description
description_pattern = re.compile('<meta name="description" content="([\s\S]*?)>')
description_og_pattern = re.compile('<meta property="og:description" content="([\s\S]*?)>')
# placeholder; every branch below either overwrites it or skips the page
text_description = 'MANCARE'
if len(re.findall(description_pattern, page_html)) == 0 and len(re.findall(description_og_pattern, page_html)) == 0:
# neither description tag exists: the source file is deleted from disk and the page is skipped
print("AM STERS")
os.remove(filepath)
counter_sterse += 1
continue
elif len(re.findall(description_pattern, page_html)) != 0 and len(re.findall(description_og_pattern, page_html)) == 0:
# only the plain description exists: an og:description tag is added after it
print("CAZ 1")
text_description = re.findall(description_pattern, page_html)
text_description = text_description[0]
description_model = '<meta name="description" content="{}">'.format(text_description)
og_description_model = '<meta property="og:description" content="{}"/>'.format(text_description)
page_html = re.sub(r'(<meta name="description" content="[\s\S]*?" />)', description_model + '\n' + og_description_model, page_html)
elif len(re.findall(description_pattern, page_html)) == 0 and len(re.findall(description_og_pattern, page_html)) != 0:
# only og:description exists: a plain description tag is added before it
print("CAZ II")
text_description = re.findall(description_og_pattern, page_html)
text_description = text_description[0]
description_model = '<meta name="description" content="{}">'.format(text_description)
og_description_model = '<meta property="og:description" content="{}"/>'.format(text_description)
page_html = re.sub(r'(<meta property="og:description" content=".*?" />)', description_model + '\n' + og_description_model, page_html)
else:
# both tags exist: the plain description is used and the page is left unchanged
print("CAZ III")
text_description = re.findall(description_pattern, page_html)
text_description = text_description[0]
print("DESCCC: ", text_description)
print(page_title)
print(f)
# canonical: taken from the canonical link, or from og:url when there is none
canonical_tag_pattern = re.compile('<link rel="canonical" href="(.*?)" />')
canonical_og_tag_pattern = re.compile('<meta property="og:url" content="(.*?)" />')
if len(re.findall(canonical_tag_pattern, page_html)) != 0 or len(re.findall(canonical_og_tag_pattern, page_html)) != 0:
if len(re.findall(canonical_tag_pattern, page_html)) != 0:
canonical_tag = re.findall(canonical_tag_pattern, page_html)
else:
canonical_tag = re.findall(canonical_og_tag_pattern, page_html)
canonical_tag = canonical_tag[0]
# print("canonical: ", canonical_tag)
# append the final note, a paragraph linking back to the canonical URL
link_pattern = '<a href={} class="color-bebe" target="_new">{}</a>'.format(canonical_tag, canonical_tag)
new_page_text = new_page_text + '\n' + '<p class="mb-40px"><strong>* Surs&#259;:</strong> {} </p>'.format(link_pattern) + '\n'
# store the information collected for this page
informatie = (page_title, text_description, canonical_tag, new_page_text)
INFORMATII_PAGINI.append(informatie)
else:
print("Pagina structura gresita - canonical: ", f)
continue
else:
print("Pagina structura gresita - title: ", f)
continue
else:
print("Pagina structura gresita - text: ", f)
continue
# page has the structure: (page_title, page_description, canonical_tag, new_page_text)
def _inlocuieste_captura(text_html, pattern, vechi, nou):
    """Replace every occurrence of ``vechi`` in ``text_html`` with ``nou``.

    ``vechi`` is the text captured earlier by group 1 of ``pattern``. When it
    is empty, ``str.replace`` would insert ``nou`` between all characters of
    the document, so ``nou`` is spliced into the empty captured slot instead.
    """
    if vechi:
        return text_html.replace(vechi, nou)
    potrivire = re.search(pattern, text_html)
    if potrivire is None or potrivire.group(1):
        return text_html
    inceput, sfarsit = potrivire.span(1)
    return text_html[:inceput] + nou + text_html[sfarsit:]


def _nume_fisier_din_canonical(adresa):
    """Derive the output file name from the last path segment of a URL."""
    if adresa.endswith('/'):
        segment = adresa.split('/')[-2]
    else:
        print("PAGE: ", adresa)
        # no trailing '/': take the last piece after '/'
        segment = adresa.split('/')[-1]
        # drop an extension such as .html or .htm
        if '.' in segment:
            segment = segment.split('.')[0]
    return segment + '.html'


def copiaza_continut_txt_html(page, cale_fisier_html):
    """Fill the HTML template with one page's data and save the result.

    page: tuple (page_title, page_description, canonical_tag, new_page_text).
    cale_fisier_html: path of the HTML template file.

    The article (between the ARTICOL START / ARTICOL FINAL markers), the
    <title>, the h3 heading, the canonical URL and the meta description of
    the template are replaced with the page's values. Each old text is
    replaced everywhere it occurs in the template. The result is written to
    the "fisiere_html_modificate" folder next to the template, under a file
    name derived from the page's canonical URL.
    """
    text_html = read_text_from_file(cale_fisier_html)
    # Computed up front so the output name exists even when the template has
    # no <title> (previously that case crashed with an unbound variable).
    new_file_name_fara_spatiu = _nume_fisier_din_canonical(page[2])

    # the pattern captures everything between the two marker comments
    articol_pattern = re.compile(r'<!-- ARTICOL START -->([\s\S]*?)<!-- ARTICOL FINAL -->[\s\S]*?')
    text_articol = articol_pattern.findall(text_html)
    if text_articol:
        # index 3 holds new_page_text
        text_html = _inlocuieste_captura(text_html, articol_pattern, text_articol[0], page[3])
    else:
        print("Fisier html fara ARTICOL START/FINAL.")

    title_pattern = re.compile(r'<title>([\s\S]*?)</title>')
    text_title = title_pattern.findall(text_html)
    # 01.02.2022: the h3 heading also receives the title - CHANGE THE TAG OF
    # THE ARTICLE TITLE HERE when the template uses a different one
    h3_pattern = re.compile(r'<h3 class="font-weight-normal"><a href="javascript:void\(0\)" class="color-black">(.*?)</a></h3>')
    text_h3 = h3_pattern.findall(text_html)
    if text_title:
        # index 0 holds the title
        text_html = _inlocuieste_captura(text_html, title_pattern, text_title[0], page[0])
        if text_h3:
            text_html = _inlocuieste_captura(text_html, h3_pattern, text_h3[0], page[0])
        else:
            print("Fisierul nu are tag-ul h3.")
        # 07.02.2022: replace the canonical URL
        canonical_tag_pattern = re.compile(r'<link rel="canonical" href="(.*?)" />')
        canonical_tag = canonical_tag_pattern.findall(text_html)
        if canonical_tag:
            text_html = _inlocuieste_captura(
                text_html,
                canonical_tag_pattern,
                canonical_tag[0],
                "https://trinketbox.ro/" + new_file_name_fara_spatiu,
            )
        else:
            print("Fisier fara tag canonical")
    else:
        print("Fisier html fara titlu.")

    description_pattern = re.compile(r'<meta name="description" content="(.*?)">')
    text_description = description_pattern.findall(text_html)
    if text_description:
        # index 1 holds the description
        text_html = _inlocuieste_captura(text_html, description_pattern, text_description[0], page[1])
    else:
        print("Fisier html fara description.")

    # the finished files go into this folder; it is created when missing
    director_iesire = os.path.join(os.path.dirname(cale_fisier_html), "fisiere_html_modificate")
    os.makedirs(director_iesire, exist_ok=True)
    file_path = os.path.join(director_iesire, new_file_name_fara_spatiu)
    write_to_file(text_html, file_path)
    print("Scriere efectuata cu succes.")
def creare_fisiere_html(cale_fisier_html):
    """
    Create one HTML file for every entry of INFORMATII_PAGINI, using the
    template found at cale_fisier_html, then print how many were processed.
    """
    total = 0
    for total, informatie in enumerate(INFORMATII_PAGINI, start=1):
        copiaza_continut_txt_html(informatie, cale_fisier_html)
    print("Numarul de fisiere modificate: ", total)
def main():
    """Generate the converted pages from the trinketbox.ro index template."""
    # AFTER THE CODE RUNS, THE NEW FILES ARE SAVED IN d:\Folder1\fisiere_html_modificate\
    cale_index_model = "d:\\Folder1\\index_trinketbox.html"
    creare_fisiere_html(cale_index_model)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment