Last active
August 25, 2025 15:33
-
-
Save maaduukaar/70bdaaed431ab9eb8af4d729e4e183df to your computer and use it in GitHub Desktop.
Script to check HTML pages for specific keywords and export results to Excel.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import pandas as pd | |
import os | |
import argparse | |
import re | |
# Input/output locations and request settings used by the script.
LINKS_FILE = "links.txt"
KEYWORDS_FILE = "keywords.txt"
OUTPUT_FILE = "result.xlsx"
REQUEST_TIMEOUT = 10  # seconds, passed to requests.get for every page

# Column order of the Excel report; also used so that an empty run still
# produces a sheet with a header row.
RESULT_COLUMNS = ["URL", "Status code", "Content size (bytes)", "Found keywords"]


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Namespace with a boolean ``whole`` attribute (``-w`` / ``--whole``).
    """
    parser = argparse.ArgumentParser(description="Check WordPress posts for keywords.")
    parser.add_argument("-w", "--whole", action="store_true", help="Match whole words only")
    return parser.parse_args(argv)


def read_nonblank_lines(path):
    """Return the stripped, non-empty lines of the UTF-8 text file *path*."""
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


def compile_keywords(keywords, whole=False):
    """Prepare keywords for repeated matching.

    Args:
        keywords: Lower-cased keywords to look for.
        whole: When true, build a whole-word regex for every keyword.

    Returns:
        List of ``(keyword, pattern)`` pairs in input order. ``pattern`` is a
        compiled regex in whole-word mode and ``None`` in substring mode.
    """
    if not whole:
        return [(word, None) for word in keywords]
    # Lookarounds instead of ``\b``: ``\b`` needs a word character on one side,
    # so a keyword that starts or ends with punctuation ("c++", ".net") could
    # never match. For ordinary words both forms behave the same.
    return [
        (word, re.compile(rf"(?<!\w){re.escape(word)}(?!\w)"))
        for word in keywords
    ]


def find_keywords(text, compiled_keywords):
    """Return the keywords present in *text*, in their original order.

    Args:
        text: Lower-cased page text to search.
        compiled_keywords: Pairs produced by ``compile_keywords``.
    """
    found = []
    for word, pattern in compiled_keywords:
        if pattern is None:
            # substring match
            matched = word in text
        else:
            # match whole word only
            matched = pattern.search(text) is not None
        if matched:
            found.append(word)
    return found


def check_url(url, compiled_keywords):
    """Download *url*, search its text for keywords and return one report row.

    Progress is printed to stdout. Any failure (network error, non-2xx status,
    parsing problem) is reported in the returned row instead of being raised,
    so one bad link does not abort the whole run.

    Returns:
        Dict keyed by ``RESULT_COLUMNS``.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        status_code = response.status_code
        content_length = len(response.content)
        print(f" → Status code: {status_code}")
        print(f" → Content size: {content_length} bytes")
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        # get plain text from the page, lower-cased to match the keywords
        text = soup.get_text(separator=" ").lower()

        found = find_keywords(text, compiled_keywords)
        if found:
            print(f" → Found: {', '.join(found)}")
        else:
            print(" → No keywords found")

        return {
            "URL": url,
            "Status code": status_code,
            "Content size (bytes)": content_length,
            "Found keywords": ", ".join(found) if found else "—",
        }
    except Exception as e:  # deliberate best-effort boundary: record and go on
        print(f" → Error: {e}")
        return {
            "URL": url,
            "Status code": "Error",
            "Content size (bytes)": "—",
            "Found keywords": f"Error: {e}",
        }


def unique_output_path(path):
    """Return *path*, or the first free ``<base>_<n><ext>`` variant if it exists."""
    if not os.path.exists(path):
        return path
    base, ext = os.path.splitext(path)
    counter = 1
    while os.path.exists(f"{base}_{counter}{ext}"):
        counter += 1
    return f"{base}_{counter}{ext}"


def main(argv=None):
    """Check every link for the keywords and save the report to Excel."""
    args = parse_args(argv)

    # read links from links.txt
    links = read_nonblank_lines(LINKS_FILE)
    # read keywords from keywords.txt and convert to lowercase
    keywords = [word.lower() for word in read_nonblank_lines(KEYWORDS_FILE)]
    # compile once, reuse for every page
    compiled_keywords = compile_keywords(keywords, whole=args.whole)

    results = []
    for i, url in enumerate(links, start=1):
        print(f"\n[{i}/{len(links)}] Checking: {url}")
        results.append(check_url(url, compiled_keywords))

    # if result.xlsx already exists, write to a new file with a numeric suffix
    output_file = unique_output_path(OUTPUT_FILE)

    # save results to Excel
    df = pd.DataFrame(results, columns=RESULT_COLUMNS)
    df.to_excel(output_file, index=False)
    print(f"\nDone ✅ Results saved to {output_file}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
HTML Page Keyword Checker
Short description
Script to check HTML pages for specific keywords and export results to Excel.
Full description
This Python script automates the process of checking web pages for specific keywords.
Features
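- Reads the list of URLs to check from links.txt (one per line).
- Reads the keywords to search for from keywords.txt (one per line, case-insensitive).
- Substring match by default (e.g. "hop" will match "hopper").
- Whole-word mode (-w or --whole) – matches only exact words (e.g. "hop" will not match "hopper").
- Saves the results to an Excel file (result.xlsx).
- Does not overwrite an existing report: a numbered file is created instead (result_1.xlsx, result_2.xlsx).
Output (Excel columns)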
Usage