Last active
May 23, 2025 17:41
-
-
Save salvatorecapolupo/21e8840c18c93e3444ee29423e27124b to your computer and use it in GitHub Desktop.
WP URL Suspension Sync: a lightweight Python script that compares all URLs from a WordPress sitemap against those exported in a Google Search Console CSV, and automatically sets to "pending" any posts/pages whose URLs are missing from the CSV. Ideal for batch-testing with a configurable limit before a full rollout.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import urlparse | |
import csv | |
import re | |
import requests | |
import xml.etree.ElementTree as ET | |
import random | |
def estrai_url_sitemap(url, timeout=30):
    """Collect every page URL reachable from a sitemap or a sitemap index.

    Sitemap indexes are followed recursively. Each sitemap address is
    fetched at most once, so an index that references itself (directly or
    through another index) cannot cause unbounded recursion.

    Errors are handled per sitemap: one that cannot be downloaded or parsed
    is reported on stdout and skipped, and the URLs gathered from the other
    sitemaps are still returned.

    NOTE: this file defines ``estrai_url_sitemap`` a second time further
    down; once the module is loaded, that later definition replaces this one.

    :param url: address of the sitemap (or sitemap index) to start from
    :param timeout: seconds to wait for each HTTP response before giving up
    :return: list of the ``<loc>`` values found inside ``<urlset>``
        documents, in the order they were encountered, with surrounding
        whitespace removed
    """
    urls = []
    visited = set()

    def process_sitemap(sitemap_url):
        # Guard against cycles and repeated entries in sitemap indexes.
        if sitemap_url in visited:
            return
        visited.add(sitemap_url)
        try:
            # Without a timeout a stalled server would block the script forever.
            response = requests.get(sitemap_url, timeout=timeout)
            response.raise_for_status()
            root = ET.fromstring(response.content)
            # Determine whether this is an index or a regular sitemap.
            if root.tag.endswith('sitemapindex'):
                for sitemap in root.findall('.//{*}sitemap'):
                    loc = sitemap.find('{*}loc')
                    target = (loc.text or '').strip() if loc is not None else ''
                    if target:
                        process_sitemap(target)
            elif root.tag.endswith('urlset'):
                for url_tag in root.findall('.//{*}url'):
                    loc = url_tag.find('{*}loc')
                    # Pretty-printed sitemaps may pad <loc> with whitespace,
                    # which would break exact URL comparisons later on.
                    page = (loc.text or '').strip() if loc is not None else ''
                    if page:
                        urls.append(page)
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"Errore durante il processamento di {sitemap_url}: {e}")

    process_sitemap(url)
    return urls
# # Esempio d'uso: | |
# if __name__ == "__main__": | |
# sitemap_urls = estrai_url_sitemap("https://www.trovalost.it/sitemap_index.xml") | |
# for u in sitemap_urls: | |
# print(u) | |
def estrai_url_da_csv(file_path):
    """
    Scan every cell of a CSV file and collect the URLs it contains.

    :param file_path: path to the .csv file
    :return: list of the URLs found, in reading order (duplicates are kept)
    """
    http_url = re.compile(r'https?://\S+')
    found = []
    with open(file_path, newline='', encoding='utf-8') as handle:
        for record in csv.reader(handle):
            for field in record:
                # A single cell may hold several http:// or https:// substrings.
                found.extend(http_url.findall(field))
    return found
# Esempio d’uso: | |
# if __name__ == "__main__": | |
# file_csv = 'porta_traffico-trovalost.csv' | |
# lista_urls = estrai_url_da_csv(file_csv) | |
# # stampa in formato “array testo” | |
# print(lista_urls) | |
# —————————————————————————————————————————————————————————
# EXTRACTION FUNCTIONS (reusing the code above; being defined later under
# the same names, these versions are the ones in effect at run time)
# —————————————————————————————————————————————————————————
def estrai_url_sitemap(sitemap_url, timeout=30):
    """Return every page URL listed by a sitemap or a sitemap index.

    Nested sitemap indexes are walked depth-first in document order, using
    an explicit stack instead of recursion. Each sitemap address is fetched
    at most once, so a cyclic or repetitive index terminates.

    Errors are not caught here: whatever ``requests.get``,
    ``raise_for_status`` or ``ET.fromstring`` raise propagates to the
    caller, and no partial result is returned in that case.

    :param sitemap_url: address of the sitemap (or sitemap index) to read
    :param timeout: seconds to wait for each HTTP response before giving up
    :return: list of the ``<loc>`` values found inside ``<urlset>``
        documents, with surrounding whitespace removed
    """
    urls = []
    seen = set()
    pending = [sitemap_url]

    while pending:
        current = pending.pop()
        # Skip sitemaps already handled (protects against index cycles).
        if current in seen:
            continue
        seen.add(current)

        # Without a timeout a stalled server would block the script forever.
        r = requests.get(current, timeout=timeout)
        r.raise_for_status()
        root = ET.fromstring(r.content)

        if root.tag.endswith('sitemapindex'):
            children = []
            for sm in root.findall('.//{*}sitemap'):
                loc = sm.find('{*}loc')
                target = (loc.text or '').strip() if loc is not None else ''
                if target:
                    children.append(target)
            # Reversed so that the first child is popped (and fetched) first,
            # matching the order a recursive walk would produce.
            pending.extend(reversed(children))
        elif root.tag.endswith('urlset'):
            for u in root.findall('.//{*}url'):
                loc = u.find('{*}loc')
                # Pretty-printed sitemaps may pad <loc> with whitespace,
                # which would break exact URL comparisons later on.
                page = (loc.text or '').strip() if loc is not None else ''
                if page:
                    urls.append(page)

    return urls
def estrai_url_da_csv(file_path):
    """
    Gather all http(s) URLs found in any cell of a CSV file.

    :param file_path: path to the .csv file, read as UTF-8
    :return: list of matched URLs in reading order (duplicates are kept)
    """
    matcher = re.compile(r'https?://\S+')
    with open(file_path, newline='', encoding='utf-8') as handle:
        # Row by row, cell by cell, match by match.
        return [
            hit
            for record in csv.reader(handle)
            for field in record
            for hit in matcher.findall(field or '')
        ]
# —————————————————————————————————————————————————————————
# WP-REST INTERACTION FUNCTIONS
# —————————————————————————————————————————————————————————
def get_post_id_by_url(wp_api_base, auth, url, post_type='posts', timeout=30):
    """
    Try to resolve the ID of a post/page from its full URL.

    The lookup is based on the slug, taken as the last non-empty segment of
    the URL path, and queries ``{wp_api_base}/wp/v2/{post_type}?slug=<slug>``.

    :param wp_api_base: base address of the REST API (e.g. ``.../wp-json``)
    :param auth: credentials forwarded to ``requests`` as ``auth``
    :param url: full public URL of the post/page
    :param post_type: REST collection to search ('posts', 'pages', ...)
    :param timeout: seconds to wait for the HTTP response before giving up
    :return: the ``id`` of the first item returned, or ``None`` when the URL
        has no slug or the API returns no item
    :raises requests.HTTPError: when the API answers with an error status
    """
    slug = urlparse(url).path.strip('/').split('/')[-1]
    if not slug:
        # A URL with no path (e.g. the site root) has no slug. A query with
        # an empty slug cannot identify a specific post, and taking the first
        # result could pick an unrelated one, so report "not found" instead.
        return None

    endpoint = f"{wp_api_base}/wp/v2/{post_type}"
    # Without a timeout a stalled server would block the script forever.
    resp = requests.get(endpoint, params={'slug': slug}, auth=auth,
                        timeout=timeout)
    resp.raise_for_status()
    data = resp.json()
    if data:
        return data[0]['id']
    return None
def set_status(wp_api_base, auth, post_id, status='pending', post_type='posts',
               timeout=30):
    """
    Set the status of a post/page through the REST API.

    Sends a POST with the JSON body ``{"status": <status>}`` to
    ``{wp_api_base}/wp/v2/{post_type}/{post_id}``.

    :param wp_api_base: base address of the REST API (e.g. ``.../wp-json``)
    :param auth: credentials forwarded to ``requests`` as ``auth``
    :param post_id: ID of the post/page to update
    :param status: new status value to send
    :param post_type: REST collection the item belongs to
    :param timeout: seconds to wait for the HTTP response before giving up
    :return: the decoded JSON body of the API response
    :raises requests.HTTPError: when the API answers with an error status
    """
    endpoint = f"{wp_api_base}/wp/v2/{post_type}/{post_id}"
    payload = {'status': status}
    # Without a timeout a stalled server would block the script forever.
    resp = requests.post(endpoint, json=payload, auth=auth, timeout=timeout)
    resp.raise_for_status()
    return resp.json()
# —————————————————————————————————————————————————————————
# MAIN FUNCTION
# —————————————————————————————————————————————————————————
def sync_suspend_urls(
    sitemap_url: str,
    csv_path: str,
    wp_api_base: str,
    auth: tuple,
    post_type: str = 'posts',
    status: str = 'pending',
    limit: int = None
):
    """
    Change the status of every sitemap URL that is missing from the CSV.

    1. Reads all URLs from the sitemap.
    2. Reads all URLs from the CSV.
    3. Computes the ones to suspend = sitemap - csv (exact string match),
       de-duplicated and kept in sitemap order so that repeated runs with
       the same inputs work on the same batch.
    4. Keeps only the first `limit` items (when `limit` is not None;
       `limit=0` therefore processes nothing).
    5. For each one, looks up the WP ID and changes its status.
    6. Returns a report as a list of (url, result) tuples.

    A failure on one URL is recorded in the report and does not stop the
    processing of the remaining URLs. Failures while reading the sitemap or
    the CSV are not caught here.

    :param sitemap_url: address of the sitemap or sitemap index
    :param csv_path: path of the CSV export containing the URLs to keep
    :param wp_api_base: base address of the REST API (e.g. ``.../wp-json``)
    :param auth: credentials passed through to the REST helpers
    :param post_type: REST collection to operate on ('posts', 'pages', ...)
    :param status: status to assign to the selected items
    :param limit: maximum number of URLs to process, or None for all
    :return: list of (url, message) tuples, one per processed URL
    """
    sitemap_urls = estrai_url_sitemap(sitemap_url)
    keep = set(estrai_url_da_csv(csv_path))

    # dict.fromkeys removes duplicates while preserving first-seen order;
    # a plain set difference would make the `limit` slice pick an arbitrary,
    # run-dependent subset.
    to_suspend = [u for u in dict.fromkeys(sitemap_urls) if u not in keep]

    # Compare against None explicitly: a bare truthiness test would treat
    # limit=0 as "no limit" and process every URL.
    if limit is not None:
        to_suspend = to_suspend[:limit]

    report = []
    for url in to_suspend:
        try:
            pid = get_post_id_by_url(wp_api_base, auth, url, post_type)
            if not pid:
                report.append((url, 'SKIPPED: post non trovato'))
                continue
            set_status(wp_api_base, auth, pid, status, post_type)
            report.append((url, f"OK: status impostato su '{status}' (ID {pid})"))
            print(url, ' OK ')
        except Exception as e:
            # Per-URL boundary: record the failure and move on to the next.
            report.append((url, f"ERROR: {e}"))
            print(url, ' KO ')
    return report
# —————————————————————————————————————————————————————————
# USAGE EXAMPLE
# —————————————————————————————————————————————————————————
if __name__ == "__main__":
    # Imported here because it is only needed by this script entry point.
    import os

    # Configure the parameters here:
    SITEMAP_URL = "https://www.sito.it/sitemap_index.xml"
    CSV_PATH = "porta_traffico-sito.csv"
    WP_API_BASE = "https://www.sito.it/wp-json"

    # auth = (username, application_password)
    # Credentials can be supplied through the WP_USER / WP_APP_PASSWORD
    # environment variables so that they do not have to live in the source.
    # NOTE(review): the literal fallbacks are kept only for backward
    # compatibility; an application password committed to a public file
    # should be revoked and the literal removed.
    AUTH = (
        os.environ.get("WP_USER", "ciuppaflesx"),
        os.environ.get("WP_APP_PASSWORD", "9imU xu04 chfU Sx1h EP0c lufc"),
    )

    # Size of this test batch: a random number of URLs between 5 and 500.
    batch_limit = random.randint(5, 500)
    print(batch_limit)

    # First test on `limit` URLs only:
    report = sync_suspend_urls(
        sitemap_url=SITEMAP_URL,
        csv_path=CSV_PATH,
        wp_api_base=WP_API_BASE,
        auth=AUTH,
        post_type='posts',  # or 'pages'
        status='pending',   # or 'draft'
        limit=batch_limit,
    )

    # Print the report
    for url, msg in report:
        print(f"{msg:40} → {url}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment