Created
November 28, 2024 15:42
-
-
Save vhsu/2ff414c0684903b9b28b7acca2bb52ef to your computer and use it in GitHub Desktop.
Generate an image sitemap from HTML
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Generate Image Sitemap for a Single HTML Page

This script reads an HTML file, extracts the image URLs from its <img> tags, and generates an XML sitemap that follows Google's image sitemap protocol. The generated sitemap can help search engines discover the images on the page.

Usage:
- Ensure you have the 'beautifulsoup4' package installed: `pip install beautifulsoup4`
- Set the `html_file` variable to the path of your HTML file.
- Set the `page_url` variable to the URL of the page where the images are located.
- Run the script to generate 'sitemap.xml' containing the image sitemap.

Parameters:
- `html_file` (str): Path to the HTML file to parse.
- `page_url` (str): The URL of the page to be included in the sitemap.
- `output_file` (str, optional): The filename for the output sitemap. Defaults to 'sitemap.xml'.

Example:
    html_file = "index.html"
    page_url = "https://example.com"
    generate_image_sitemap_single_page(html_file, page_url)
"""
from bs4 import BeautifulSoup | |
def generate_image_sitemap_single_page(html_file, page_url, output_file="sitemap.xml"):
    """Write a Google image sitemap for the images found in one HTML file.

    Every ``<img>`` tag in ``html_file`` that has a usable ``src`` becomes an
    ``<image:image>`` entry under a single ``<url>`` element for ``page_url``.
    Image URLs are resolved against ``page_url`` so relative paths become
    absolute, duplicates are listed once (first occurrence wins), and all
    URLs are XML-escaped before being written.

    Args:
        html_file: Path to the HTML file to parse (read as UTF-8).
        page_url: URL of the page the images belong to. It is written as the
            ``<loc>`` of the page and used as the base for relative ``src``
            values.
        output_file: Path of the sitemap file to write (UTF-8). Defaults to
            ``"sitemap.xml"``.

    Raises:
        OSError: If ``html_file`` cannot be read or ``output_file`` cannot be
            written.
    """
    # Imported locally so the function does not depend on edits to the
    # file-level import block.
    from urllib.parse import urljoin
    from xml.sax.saxutils import escape

    # Read the HTML file
    with open(html_file, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    # Collect absolute image URLs in document order, without duplicates.
    image_urls = []
    seen = set()
    for img in soup.find_all('img'):
        src = (img.get('src') or '').strip()
        # Skip missing/empty sources and inline data URIs: neither is a
        # fetchable URL, so they have no place in a sitemap.
        if not src or src.lower().startswith('data:'):
            continue
        # Sitemap locations must be absolute; urljoin leaves already-absolute
        # URLs untouched.
        absolute_url = urljoin(page_url, src)
        if absolute_url in seen:
            continue
        seen.add(absolute_url)
        image_urls.append(absolute_url)

    # Build the sitemap XML structure. The parser decodes entities such as
    # "&amp;" in attribute values, so URLs must be re-escaped to keep the
    # output well-formed.
    lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"',
        ' xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">',
        ' <url>',
        f' <loc>{escape(page_url)}</loc>',
    ]
    for image_url in image_urls:
        lines.append(' <image:image>')
        lines.append(f' <image:loc>{escape(image_url)}</image:loc>')
        lines.append(' </image:image>')
    lines.append(' </url>')
    lines.append('</urlset>')
    sitemap = '\n'.join(lines)

    # Write the sitemap to a file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(sitemap)
    print(f"Sitemap saved to {output_file}")
# Example usage. Guarded so that importing this module (e.g. to reuse
# generate_image_sitemap_single_page) does not try to read "index.html";
# running the file as a script behaves as before.
if __name__ == "__main__":
    html_file = "index.html"  # Path to your HTML file
    page_url = "https://example.com"  # The page URL
    generate_image_sitemap_single_page(html_file, page_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment