salvatorecapolupo · May 23, 2025 17:41
diff --git a/potatura.py b/potatura.py
 from urllib.parse import urlparse
 import csv
 import re
 import requests
 import xml.etree.ElementTree as ET
 import random

 def estrai_url_sitemap(url):
    """
    Collect every page URL reachable from a sitemap or a sitemap index.

    Sitemap indexes are followed recursively. A sitemap that cannot be
    downloaded or parsed is reported on stdout and skipped, so the returned
    list may be partial.

    :param url: URL of the sitemap (or sitemap index) to start from
    :return: list of the <loc> values found in <urlset> documents
    """
    urls = []
    visited = set()  # sitemaps already fetched; stops self-referencing indexes

    def process_sitemap(sitemap_url):
        if sitemap_url in visited:
            return
        visited.add(sitemap_url)
        try:
            # Without a timeout a single stalled server would hang the run.
            response = requests.get(sitemap_url, timeout=30)
            response.raise_for_status()
            root = ET.fromstring(response.content)
            # Decide whether this document is an index or a plain sitemap.
            if root.tag.endswith('sitemapindex'):
                for sitemap in root.findall('.//{*}sitemap'):
                    loc = sitemap.find('{*}loc')
                    # <loc> text is often wrapped in whitespace/newlines.
                    target = (loc.text or '').strip() if loc is not None else ''
                    if target:
                        process_sitemap(target)
            elif root.tag.endswith('urlset'):
                for url_tag in root.findall('.//{*}url'):
                    loc = url_tag.find('{*}loc')
                    target = (loc.text or '').strip() if loc is not None else ''
                    if target:
                        urls.append(target)
        except Exception as e:
            # Deliberate best effort: report the failure and keep going.
            print(f"Errore durante il processamento di {sitemap_url}: {e}")

    process_sitemap(url)
    return urls

 # # Esempio d'uso:
 # if __name__ == "__main__":
 #     sitemap_urls = estrai_url_sitemap("https://www.trovalost.it/sitemap_index.xml")
 #     for u in sitemap_urls:
 #         print(u)


 def estrai_url_da_csv(file_path):
    """
    Scan every cell of a CSV file and return the URLs found in it.

    :param file_path: path to the .csv file (opened as UTF-8)
    :return: list of http:// and https:// URLs, in reading order
    """
    matcher = re.compile(r'https?://\S+')
    collected = []
    with open(file_path, newline='', encoding='utf-8') as handle:
        for record in csv.reader(handle):
            # A single cell may hold several URLs; keep all of them.
            collected.extend(
                hit for field in record for hit in matcher.findall(field)
            )
    return collected

 # Esempio d’uso:
 # if __name__ == "__main__":
 #     file_csv = 'porta_traffico-trovalost.csv'
 #     lista_urls = estrai_url_da_csv(file_csv)
 #     # stampa in formato “array testo”
 #     print(lista_urls)

 # —————————————————————————————————————————————————————————
 # FUNZIONI DI ESTRAZIONE (riutilizzano il tuo codice)
 # —————————————————————————————————————————————————————————

 def estrai_url_sitemap(sitemap_url):
    """
    Return every page URL listed by a sitemap, following sitemap indexes.

    Download and XML parsing failures are not caught here; they propagate
    to the caller.

    :param sitemap_url: URL of the sitemap (or sitemap index) to start from
    :return: list of the <loc> values found in <urlset> documents
    :raises requests.RequestException: if a download fails or returns an
        error status
    :raises xml.etree.ElementTree.ParseError: if a response is not
        well-formed XML
    """
    urls = []
    visited = set()  # sitemaps already fetched; stops self-referencing indexes

    def _process(surl):
        if surl in visited:
            return
        visited.add(surl)
        # Without a timeout a single stalled server would hang the run.
        r = requests.get(surl, timeout=30)
        r.raise_for_status()
        root = ET.fromstring(r.content)
        if root.tag.endswith('sitemapindex'):
            entries, is_index = root.findall('.//{*}sitemap'), True
        elif root.tag.endswith('urlset'):
            entries, is_index = root.findall('.//{*}url'), False
        else:
            return
        for entry in entries:
            loc = entry.find('{*}loc')
            # <loc> text is often wrapped in whitespace/newlines.
            target = (loc.text or '').strip() if loc is not None else ''
            if not target:
                continue
            if is_index:
                _process(target)
            else:
                urls.append(target)

    _process(sitemap_url)
    return urls

 def estrai_url_da_csv(file_path):
    """
    Return all http(s) URLs found in any cell of the given CSV file.

    :param file_path: path to the .csv file (opened as UTF-8)
    :return: list of URLs in reading order
    """
    url_re = re.compile(r'https?://\S+')
    with open(file_path, newline='', encoding='utf-8') as handle:
        # Flatten rows -> cells -> matches into one list.
        return [
            found
            for record in csv.reader(handle)
            for field in record
            for found in url_re.findall(field or '')
        ]

 # —————————————————————————————————————————————————————————
 # FUNZIONI D’INTERAZIONE CON WP-REST
 # —————————————————————————————————————————————————————————

 def get_post_id_by_url(wp_api_base, auth, url, post_type='posts'):
    """
    Look up the WordPress ID of a post/page from its full URL.

    The lookup is based on the slug, taken as the last segment of the URL
    path, and queries ``{wp_api_base}/wp/v2/{post_type}`` with it.

    :param wp_api_base: base URL of the REST API
    :param auth: value passed to ``requests`` as ``auth``
    :param url: full URL of the post/page
    :param post_type: REST collection name used in the endpoint path
    :return: ``id`` of the first item returned, or None when the URL has no
        slug or nothing is returned
    :raises requests.RequestException: if the request fails or returns an
        error status
    """
    slug = urlparse(url).path.strip('/').split('/')[-1]
    if not slug:
        # A URL without a path (e.g. the home page) has no slug. Querying
        # with an empty filter cannot identify a post and could return
        # unrelated items, whose first ID would then be modified by callers.
        return None
    endpoint = f"{wp_api_base}/wp/v2/{post_type}"
    # Without a timeout a stalled server would hang the run.
    resp = requests.get(endpoint, params={'slug': slug}, auth=auth, timeout=30)
    resp.raise_for_status()
    data = resp.json()
    return data[0]['id'] if data else None

 def set_status(wp_api_base, auth, post_id, status='pending', post_type='posts'):
    """
    Set the status of a post/page through the REST API.

    Sends a POST with ``{'status': status}`` to
    ``{wp_api_base}/wp/v2/{post_type}/{post_id}``.

    :param wp_api_base: base URL of the REST API
    :param auth: value passed to ``requests`` as ``auth``
    :param post_id: ID of the item to update
    :param status: status value to send
    :param post_type: REST collection name used in the endpoint path
    :return: decoded JSON body of the response
    :raises requests.RequestException: if the request fails or returns an
        error status
    """
    endpoint = f"{wp_api_base}/wp/v2/{post_type}/{post_id}"
    # Without a timeout a stalled server would hang the run.
    resp = requests.post(endpoint, json={'status': status}, auth=auth, timeout=30)
    resp.raise_for_status()
    return resp.json()

 # —————————————————————————————————————————————————————————
 # FUNZIONE PRINCIPALE
 # —————————————————————————————————————————————————————————

 def sync_suspend_urls(
    sitemap_url: str,
    csv_path: str,
    wp_api_base: str,
    auth: tuple,
    post_type: str = 'posts',
    status: str = 'pending',
    limit: int = None
 ):
    """
    Change the status of every sitemap URL that is not listed in the CSV.

    1. Read all URLs from the sitemap.
    2. Read all URLs from the CSV.
    3. URLs to suspend = sitemap URLs not present in the CSV (exact string
       comparison), de-duplicated and kept in sitemap order.
    4. Keep only the first `limit` of them (when `limit` is not None).
    5. For each one, look up the WP ID and change its status.
    6. Return a report of ``(url, result)`` tuples.

    :param sitemap_url: sitemap or sitemap index to read
    :param csv_path: CSV file containing the URLs to keep
    :param wp_api_base: base URL of the REST API
    :param auth: value passed to ``requests`` as ``auth``
    :param post_type: REST collection name used in the endpoint path
    :param status: status to assign
    :param limit: maximum number of URLs to process; None means no limit,
        0 (or a negative value) means process nothing
    :return: list of ``(url, message)`` tuples, one per processed URL
    """
    sitemap_urls = estrai_url_sitemap(sitemap_url)
    keep = set(estrai_url_da_csv(csv_path))

    # dict.fromkeys de-duplicates while preserving sitemap order, so the
    # `limit` slice selects the same URLs on every run (a set difference
    # would yield them in arbitrary order).
    to_suspend = [u for u in dict.fromkeys(sitemap_urls) if u not in keep]
    if limit is not None:
        # Compare against None: a plain truthiness test would treat
        # limit=0 as "no limit" and process every URL.
        to_suspend = to_suspend[:max(limit, 0)]

    report = []
    for url in to_suspend:
        try:
            pid = get_post_id_by_url(wp_api_base, auth, url, post_type)
            if not pid:
                report.append((url, 'SKIPPED: post non trovato'))
                continue
            set_status(wp_api_base, auth, pid, status, post_type)
            report.append((url, f"OK: status impostato su '{status}' (ID {pid})"))
            print(url, ' OK ')
        except Exception as e:
            # One failing URL must not abort the whole batch.
            report.append((url, f"ERROR: {e}"))
            print(url, ' KO ')

    return report

 # —————————————————————————————————————————————————————————
 # ESEMPIO D’USO
 # —————————————————————————————————————————————————————————

 if __name__ == "__main__":
    import os  # local import: only the script entry point needs it

    # Configure the parameters here:
    SITEMAP_URL = "https://www.sito.it/sitemap_index.xml"
    CSV_PATH    = "porta_traffico-sito.csv"
    WP_API_BASE = "https://www.sito.it/wp-json"
    # auth = (username, application_password)
    # Credentials are read from the environment so that no secret is stored
    # in the source file or in version control.
    wp_user = os.environ.get("WP_USER")
    wp_app_password = os.environ.get("WP_APP_PASSWORD")
    if not wp_user or not wp_app_password:
        raise SystemExit(
            "Set the WP_USER and WP_APP_PASSWORD environment variables."
        )
    AUTH = (wp_user, wp_app_password)

    # Size of this run's batch: a random number of URLs between 5 and 500.
    max_urls = random.randint(5, 500)
    print(max_urls)

    # First test on at most `max_urls` URLs:
    report = sync_suspend_urls(
        sitemap_url=SITEMAP_URL,
        csv_path=CSV_PATH,
        wp_api_base=WP_API_BASE,
        auth=AUTH,
        post_type='posts',    # or 'pages'
        status='pending',     # or 'draft'
        limit=max_urls
    )

    # Print the report
    for url, msg in report:
        print(f"{msg:40} → {url}")
	from urllib.parse import urlparse
	import csv
	import re
	import requests
	import xml.etree.ElementTree as ET
	import random

	def estrai_url_sitemap(url):
	    """
	    Collect every page URL reachable from a sitemap or a sitemap index.

	    Sitemap indexes are followed recursively. A sitemap that cannot be
	    downloaded or parsed is reported on stdout and skipped, so the
	    returned list may be partial.

	    :param url: URL of the sitemap (or sitemap index) to start from
	    :return: list of the <loc> values found in <urlset> documents
	    """
	    urls = []
	    visited = set()  # sitemaps already fetched; stops self-referencing indexes

	    def process_sitemap(sitemap_url):
	        if sitemap_url in visited:
	            return
	        visited.add(sitemap_url)
	        try:
	            # Without a timeout a single stalled server would hang the run.
	            response = requests.get(sitemap_url, timeout=30)
	            response.raise_for_status()
	            root = ET.fromstring(response.content)
	            # Decide whether this document is an index or a plain sitemap.
	            if root.tag.endswith('sitemapindex'):
	                for sitemap in root.findall('.//{*}sitemap'):
	                    loc = sitemap.find('{*}loc')
	                    # <loc> text is often wrapped in whitespace/newlines.
	                    target = (loc.text or '').strip() if loc is not None else ''
	                    if target:
	                        process_sitemap(target)
	            elif root.tag.endswith('urlset'):
	                for url_tag in root.findall('.//{*}url'):
	                    loc = url_tag.find('{*}loc')
	                    target = (loc.text or '').strip() if loc is not None else ''
	                    if target:
	                        urls.append(target)
	        except Exception as e:
	            # Deliberate best effort: report the failure and keep going.
	            print(f"Errore durante il processamento di {sitemap_url}: {e}")

	    process_sitemap(url)
	    return urls

	# # Esempio d'uso:
	# if __name__ == "__main__":
	# sitemap_urls = estrai_url_sitemap("https://www.trovalost.it/sitemap_index.xml")
	# for u in sitemap_urls:
	# print(u)


	def estrai_url_da_csv(file_path):
	    """
	    Extract all URLs present in any cell of a CSV file.

	    :param file_path: path to the .csv file (opened as UTF-8)
	    :return: list of http:// and https:// URLs, in reading order
	    """
	    url_pattern = re.compile(r'https?://\S+')
	    urls = []
	    with open(file_path, newline='', encoding='utf-8') as f:
	        for row in csv.reader(f):
	            for cell in row:
	                # A single cell may hold several URLs; keep all of them.
	                urls.extend(url_pattern.findall(cell))
	    return urls

	# Esempio d’uso:
	# if __name__ == "__main__":
	# file_csv = 'porta_traffico-trovalost.csv'
	# lista_urls = estrai_url_da_csv(file_csv)
	# # stampa in formato “array testo”
	# print(lista_urls)

	# —————————————————————————————————————————————————————————
	# FUNZIONI DI ESTRAZIONE (riutilizzano il tuo codice)
	# —————————————————————————————————————————————————————————

	def estrai_url_sitemap(sitemap_url):
	    """
	    Return every page URL listed by a sitemap, following sitemap indexes.

	    Download and XML parsing failures are not caught here; they propagate
	    to the caller.

	    :param sitemap_url: URL of the sitemap (or sitemap index) to start from
	    :return: list of the <loc> values found in <urlset> documents
	    :raises requests.RequestException: if a download fails or returns an
	        error status
	    :raises xml.etree.ElementTree.ParseError: if a response is not
	        well-formed XML
	    """
	    urls = []
	    visited = set()  # sitemaps already fetched; stops self-referencing indexes

	    def _process(surl):
	        if surl in visited:
	            return
	        visited.add(surl)
	        # Without a timeout a single stalled server would hang the run.
	        r = requests.get(surl, timeout=30)
	        r.raise_for_status()
	        root = ET.fromstring(r.content)
	        if root.tag.endswith('sitemapindex'):
	            entries, is_index = root.findall('.//{*}sitemap'), True
	        elif root.tag.endswith('urlset'):
	            entries, is_index = root.findall('.//{*}url'), False
	        else:
	            return
	        for entry in entries:
	            loc = entry.find('{*}loc')
	            # <loc> text is often wrapped in whitespace/newlines.
	            target = (loc.text or '').strip() if loc is not None else ''
	            if not target:
	                continue
	            if is_index:
	                _process(target)
	            else:
	                urls.append(target)

	    _process(sitemap_url)
	    return urls

	def estrai_url_da_csv(file_path):
	    """
	    Return all http(s) URLs found in any cell of the given CSV file.

	    :param file_path: path to the .csv file (opened as UTF-8)
	    :return: list of URLs in reading order
	    """
	    pattern = re.compile(r'https?://\S+')
	    with open(file_path, newline='', encoding='utf-8') as f:
	        # Flatten rows -> cells -> matches into one list.
	        return [
	            match
	            for row in csv.reader(f)
	            for cell in row
	            for match in pattern.findall(cell or '')
	        ]

	# —————————————————————————————————————————————————————————
	# FUNZIONI D’INTERAZIONE CON WP-REST
	# —————————————————————————————————————————————————————————

	def get_post_id_by_url(wp_api_base, auth, url, post_type='posts'):
	    """
	    Look up the WordPress ID of a post/page from its full URL.

	    The lookup is based on the slug, taken as the last segment of the URL
	    path, and queries ``{wp_api_base}/wp/v2/{post_type}`` with it.

	    :param wp_api_base: base URL of the REST API
	    :param auth: value passed to ``requests`` as ``auth``
	    :param url: full URL of the post/page
	    :param post_type: REST collection name used in the endpoint path
	    :return: ``id`` of the first item returned, or None when the URL has
	        no slug or nothing is returned
	    :raises requests.RequestException: if the request fails or returns an
	        error status
	    """
	    slug = urlparse(url).path.strip('/').split('/')[-1]
	    if not slug:
	        # A URL without a path (e.g. the home page) has no slug. Querying
	        # with an empty filter cannot identify a post and could return
	        # unrelated items, whose first ID would then be modified by callers.
	        return None
	    endpoint = f"{wp_api_base}/wp/v2/{post_type}"
	    # Without a timeout a stalled server would hang the run.
	    resp = requests.get(endpoint, params={'slug': slug}, auth=auth, timeout=30)
	    resp.raise_for_status()
	    data = resp.json()
	    return data[0]['id'] if data else None

	def set_status(wp_api_base, auth, post_id, status='pending', post_type='posts'):
	    """
	    Set the status of a post/page through the REST API.

	    Sends a POST with ``{'status': status}`` to
	    ``{wp_api_base}/wp/v2/{post_type}/{post_id}``.

	    :param wp_api_base: base URL of the REST API
	    :param auth: value passed to ``requests`` as ``auth``
	    :param post_id: ID of the item to update
	    :param status: status value to send
	    :param post_type: REST collection name used in the endpoint path
	    :return: decoded JSON body of the response
	    :raises requests.RequestException: if the request fails or returns an
	        error status
	    """
	    endpoint = f"{wp_api_base}/wp/v2/{post_type}/{post_id}"
	    payload = {'status': status}
	    # Without a timeout a stalled server would hang the run.
	    resp = requests.post(endpoint, json=payload, auth=auth, timeout=30)
	    resp.raise_for_status()
	    return resp.json()

	# —————————————————————————————————————————————————————————
	# FUNZIONE PRINCIPALE
	# —————————————————————————————————————————————————————————

	def sync_suspend_urls(
	    sitemap_url: str,
	    csv_path: str,
	    wp_api_base: str,
	    auth: tuple,
	    post_type: str = 'posts',
	    status: str = 'pending',
	    limit: int = None
	):
	    """
	    Change the status of every sitemap URL that is not listed in the CSV.

	    1. Read all URLs from the sitemap.
	    2. Read all URLs from the CSV.
	    3. URLs to suspend = sitemap URLs not present in the CSV (exact string
	       comparison), de-duplicated and kept in sitemap order.
	    4. Keep only the first `limit` of them (when `limit` is not None).
	    5. For each one, look up the WP ID and change its status.
	    6. Return a report of ``(url, result)`` tuples.

	    :param sitemap_url: sitemap or sitemap index to read
	    :param csv_path: CSV file containing the URLs to keep
	    :param wp_api_base: base URL of the REST API
	    :param auth: value passed to ``requests`` as ``auth``
	    :param post_type: REST collection name used in the endpoint path
	    :param status: status to assign
	    :param limit: maximum number of URLs to process; None means no limit,
	        0 (or a negative value) means process nothing
	    :return: list of ``(url, message)`` tuples, one per processed URL
	    """
	    sitemap_urls = estrai_url_sitemap(sitemap_url)
	    keep = set(estrai_url_da_csv(csv_path))

	    # dict.fromkeys de-duplicates while preserving sitemap order, so the
	    # `limit` slice selects the same URLs on every run (a set difference
	    # would yield them in arbitrary order).
	    to_suspend = [u for u in dict.fromkeys(sitemap_urls) if u not in keep]
	    if limit is not None:
	        # Compare against None: a plain truthiness test would treat
	        # limit=0 as "no limit" and process every URL.
	        to_suspend = to_suspend[:max(limit, 0)]

	    report = []
	    for url in to_suspend:
	        try:
	            pid = get_post_id_by_url(wp_api_base, auth, url, post_type)
	            if not pid:
	                report.append((url, 'SKIPPED: post non trovato'))
	                continue
	            set_status(wp_api_base, auth, pid, status, post_type)
	            report.append((url, f"OK: status impostato su '{status}' (ID {pid})"))
	            print(url, ' OK ')
	        except Exception as e:
	            # One failing URL must not abort the whole batch.
	            report.append((url, f"ERROR: {e}"))
	            print(url, ' KO ')

	    return report

	# —————————————————————————————————————————————————————————
	# ESEMPIO D’USO
	# —————————————————————————————————————————————————————————

	if __name__ == "__main__":
	    import os  # local import: only the script entry point needs it

	    # Configure the parameters here:
	    SITEMAP_URL = "https://www.sito.it/sitemap_index.xml"
	    CSV_PATH = "porta_traffico-sito.csv"
	    WP_API_BASE = "https://www.sito.it/wp-json"
	    # auth = (username, application_password)
	    # Credentials are read from the environment so that no secret is
	    # stored in the source file or in version control.
	    wp_user = os.environ.get("WP_USER")
	    wp_app_password = os.environ.get("WP_APP_PASSWORD")
	    if not wp_user or not wp_app_password:
	        raise SystemExit(
	            "Set the WP_USER and WP_APP_PASSWORD environment variables."
	        )
	    AUTH = (wp_user, wp_app_password)

	    # Size of this run's batch: a random number of URLs between 5 and 500.
	    max_urls = random.randint(5, 500)
	    print(max_urls)

	    # First test on at most `max_urls` URLs:
	    report = sync_suspend_urls(
	        sitemap_url=SITEMAP_URL,
	        csv_path=CSV_PATH,
	        wp_api_base=WP_API_BASE,
	        auth=AUTH,
	        post_type='posts',  # or 'pages'
	        status='pending',  # or 'draft'
	        limit=max_urls
	    )

	    # Print the report
	    for url, msg in report:
	        print(f"{msg:40} → {url}")