Skip to content

Instantly share code, notes, and snippets.

@salvatorecapolupo
Last active May 23, 2025 17:41
Show Gist options
  • Save salvatorecapolupo/21e8840c18c93e3444ee29423e27124b to your computer and use it in GitHub Desktop.
Save salvatorecapolupo/21e8840c18c93e3444ee29423e27124b to your computer and use it in GitHub Desktop.
WP URL Suspension Sync: a lightweight Python script that compares all URLs from a WordPress sitemap against those exported in a Google Search Console CSV and automatically sets the status of any posts/pages whose URLs are missing from the CSV to "pending". A configurable limit makes it possible to batch-test on a few URLs before a full rollout.
from urllib.parse import urlparse
import csv
import re
import requests
import xml.etree.ElementTree as ET
import random
def estrai_url_sitemap(url, timeout=30):
    """
    Collect every page URL reachable from a sitemap or sitemap index.

    Sitemap indexes are followed recursively; regular sitemaps contribute the
    text of each ``<url><loc>`` element. A failure on an individual sitemap is
    printed and skipped (best effort), so the result may be partial.

    NOTE(review): a later ``def estrai_url_sitemap`` in this file rebinds the
    name, so this version is only the active one until that point.

    :param url: URL of the sitemap (or sitemap index) to start from
    :param timeout: seconds to wait for each HTTP response
    :return: list of page URLs, in document order
    """
    urls = []
    visited = set()  # sitemaps already fetched; guards against index cycles

    def loc_text(element):
        # Text of the element's <loc> child without surrounding whitespace
        # ('' when the child or its text is missing).
        loc = element.find('{*}loc')
        if loc is None or not loc.text:
            return ''
        return loc.text.strip()

    def process_sitemap(sitemap_url):
        if sitemap_url in visited:
            return
        visited.add(sitemap_url)
        try:
            # Without a timeout a stalled server would block the script forever.
            response = requests.get(sitemap_url, timeout=timeout)
            response.raise_for_status()
            root = ET.fromstring(response.content)
            # Work out whether this is an index or a regular sitemap.
            if root.tag.endswith('sitemapindex'):
                for sitemap in root.findall('.//{*}sitemap'):
                    child_url = loc_text(sitemap)
                    if child_url:
                        process_sitemap(child_url)
            elif root.tag.endswith('urlset'):
                for url_tag in root.findall('.//{*}url'):
                    page_url = loc_text(url_tag)
                    if page_url:
                        urls.append(page_url)
        except Exception as e:
            # Deliberate best effort: report the failing sitemap and carry on.
            print(f"Errore durante il processamento di {sitemap_url}: {e}")

    process_sitemap(url)
    return urls
# # Esempio d'uso:
# if __name__ == "__main__":
# sitemap_urls = estrai_url_sitemap("https://www.trovalost.it/sitemap_index.xml")
# for u in sitemap_urls:
# print(u)
def estrai_url_da_csv(file_path):
    """
    Collect every URL found in any cell of a CSV file.

    :param file_path: path to the .csv file
    :return: list of the URLs found, in reading order (duplicates kept)
    """
    url_regex = re.compile(r'https?://\S+')
    with open(file_path, newline='', encoding='utf-8') as csv_file:
        # A single cell may hold zero or more http:// or https:// substrings.
        return [
            found
            for record in csv.reader(csv_file)
            for field in record
            for found in url_regex.findall(field)
        ]
# Esempio d’uso:
# if __name__ == "__main__":
# file_csv = 'porta_traffico-trovalost.csv'
# lista_urls = estrai_url_da_csv(file_csv)
# # stampa in formato “array testo”
# print(lista_urls)
# —————————————————————————————————————————————————————————
# EXTRACTION FUNCTIONS (these redefine the same-named functions above)
# —————————————————————————————————————————————————————————
def estrai_url_sitemap(sitemap_url, timeout=30):
    """
    Return every page URL listed by a sitemap, following sitemap indexes.

    Unlike a best-effort crawl, any HTTP or XML parsing error propagates to
    the caller, so a returned list is never silently partial.

    :param sitemap_url: URL of the sitemap or sitemap index to start from
    :param timeout: seconds to wait for each HTTP response
    :return: list of page URLs, in document order
    :raises Exception: whatever ``requests`` raises (including the HTTP error
        from ``raise_for_status``) or the XML parser raises on bad content
    """
    urls = []
    seen = set()  # sitemaps already fetched; guards against index cycles

    def _loc_text(node):
        # Text of the node's <loc> child without surrounding whitespace
        # ('' when the child or its text is missing).
        loc = node.find('{*}loc')
        if loc is None or not loc.text:
            return ''
        return loc.text.strip()

    def _process(surl):
        if surl in seen:
            return
        seen.add(surl)
        # Without a timeout a stalled server would block the script forever.
        r = requests.get(surl, timeout=timeout)
        r.raise_for_status()
        root = ET.fromstring(r.content)
        if root.tag.endswith('sitemapindex'):
            for sm in root.findall('.//{*}sitemap'):
                child = _loc_text(sm)
                if child:
                    _process(child)
        elif root.tag.endswith('urlset'):
            for u in root.findall('.//{*}url'):
                page = _loc_text(u)
                if page:
                    urls.append(page)

    _process(sitemap_url)
    return urls
def estrai_url_da_csv(file_path):
    """
    Scan every cell of a CSV file and return all http(s) URLs found.

    :param file_path: path to the .csv file (read as UTF-8)
    :return: list of URLs in reading order, duplicates included
    """
    url_re = re.compile(r'https?://\S+')
    with open(file_path, newline='', encoding='utf-8') as handle:
        cells = [cell for record in csv.reader(handle) for cell in record]
    found = []
    for cell in cells:
        found.extend(url_re.findall(cell or ''))
    return found
# —————————————————————————————————————————————————————————
# FUNZIONI D’INTERAZIONE CON WP-REST
# —————————————————————————————————————————————————————————
def get_post_id_by_url(wp_api_base, auth, url, post_type='posts', timeout=30):
    """
    Try to retrieve the ID of a post/page from its full URL.

    The lookup is by slug, taken as the last path segment of ``url``; the
    first item returned by the collection endpoint is used.

    :param wp_api_base: REST root the endpoint is built from
        (e.g. ``https://example.com/wp-json``)
    :param auth: credentials passed to ``requests`` as ``auth``
    :param url: full URL of the post/page
    :param post_type: REST collection name (``'posts'`` or ``'pages'``)
    :param timeout: seconds to wait for the HTTP response
    :return: ID of the first match, or ``None`` when the URL has no slug or
        the endpoint returns no items
    :raises requests.HTTPError: when the endpoint answers with an error status
    """
    slug = urlparse(url).path.strip('/').split('/')[-1]
    if not slug:
        # E.g. the site home page. An empty slug cannot identify a post, and
        # querying with it risks picking an unrelated post as data[0].
        return None
    endpoint = f"{wp_api_base}/wp/v2/{post_type}"
    # Without a timeout a stalled server would block the script forever.
    resp = requests.get(endpoint, params={'slug': slug}, auth=auth, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()
    if data:
        return data[0]['id']
    return None
def set_status(wp_api_base, auth, post_id, status='pending', post_type='posts', timeout=30):
    """
    Set the status of a post/page through the REST API.

    :param wp_api_base: REST root the endpoint is built from
        (e.g. ``https://example.com/wp-json``)
    :param auth: credentials passed to ``requests`` as ``auth``
    :param post_id: ID of the post/page to update
    :param status: status value sent in the request body
    :param post_type: REST collection name (``'posts'`` or ``'pages'``)
    :param timeout: seconds to wait for the HTTP response
    :return: the decoded JSON body of the response
    :raises requests.HTTPError: when the endpoint answers with an error status
    """
    endpoint = f"{wp_api_base}/wp/v2/{post_type}/{post_id}"
    payload = {'status': status}
    # Without a timeout a stalled server would block the whole batch forever.
    resp = requests.post(endpoint, json=payload, auth=auth, timeout=timeout)
    resp.raise_for_status()
    return resp.json()
# —————————————————————————————————————————————————————————
# FUNZIONE PRINCIPALE
# —————————————————————————————————————————————————————————
def sync_suspend_urls(
    sitemap_url: str,
    csv_path: str,
    wp_api_base: str,
    auth: tuple,
    post_type: str = 'posts',
    status: str = 'pending',
    limit: int = None
) -> list:
    """
    Change the status of every sitemap URL that is missing from the CSV.

    1. Read all URLs from the sitemap.
    2. Read all URLs from the CSV.
    3. Compute the URLs to suspend = sitemap - csv (sorted, so that a limited
       run always selects the same URLs).
    4. Keep at most ``limit`` of them (no cap when ``limit`` is None).
    5. For each one, look up the WP ID and change its status.
    6. Return a report as a list of ``(url, result)`` tuples.

    :param sitemap_url: sitemap or sitemap index to read
    :param csv_path: path of the CSV file holding the URLs to keep
    :param wp_api_base: REST root (e.g. ``https://example.com/wp-json``)
    :param auth: credentials forwarded to the REST helpers
    :param post_type: REST collection name (``'posts'`` or ``'pages'``)
    :param status: status to assign to the suspended posts
    :param limit: maximum number of URLs to process; ``None`` means all,
        ``0`` means none
    :return: list of ``(url, message)`` tuples, one per processed URL
    :raises ValueError: if ``limit`` is negative
    """
    if limit is not None:
        if limit < 0:
            raise ValueError(f"limit must be >= 0 or None, got {limit!r}")
        if limit == 0:
            # Nothing may be touched, so skip the network and file access.
            # (A plain truthiness test would read 0 as "no limit".)
            return []

    sitemap_urls = estrai_url_sitemap(sitemap_url)
    csv_urls = estrai_url_da_csv(csv_path)
    # Set order is arbitrary; sort so a limited batch is reproducible.
    to_suspend = sorted(set(sitemap_urls) - set(csv_urls))
    if limit is not None:
        to_suspend = to_suspend[:limit]

    report = []
    for url in to_suspend:
        # One failing URL must not abort the batch: record it and continue.
        try:
            pid = get_post_id_by_url(wp_api_base, auth, url, post_type)
            if not pid:
                report.append((url, 'SKIPPED: post non trovato'))
                continue
            set_status(wp_api_base, auth, pid, status, post_type)
            report.append((url, f"OK: status impostato su '{status}' (ID {pid})"))
            print(url, ' OK ')
        except Exception as e:
            report.append((url, f"ERROR: {e}"))
            print(url, ' KO ')
    return report
# —————————————————————————————————————————————————————————
# ESEMPIO D’USO
# —————————————————————————————————————————————————————————
if __name__ == "__main__":
    import os

    # Configure the parameters here:
    SITEMAP_URL = "https://www.sito.it/sitemap_index.xml"
    CSV_PATH = "porta_traffico-sito.csv"
    WP_API_BASE = "https://www.sito.it/wp-json"

    # auth = (username, application_password).
    # Read from the environment so credentials never live in the source file.
    # NOTE(review): if the password previously hard-coded here was real,
    # revoke it in WordPress.
    wp_user = os.environ.get("WP_USER")
    wp_app_password = os.environ.get("WP_APP_PASSWORD")
    if not wp_user or not wp_app_password:
        raise SystemExit(
            "Set the WP_USER and WP_APP_PASSWORD environment variables before running."
        )
    AUTH = (wp_user, wp_app_password)

    # Size of this test batch: a random number of URLs between 5 and 500.
    batch_limit = random.randint(5, 500)
    print(batch_limit)

    # First test on only `batch_limit` URLs:
    report = sync_suspend_urls(
        sitemap_url=SITEMAP_URL,
        csv_path=CSV_PATH,
        wp_api_base=WP_API_BASE,
        auth=AUTH,
        post_type='posts',   # or 'pages'
        status='pending',    # or 'draft'
        limit=batch_limit,
    )

    # Print the report
    for url, msg in report:
        print(f"{msg:40} → {url}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment