Last active
November 24, 2023 04:21
-
-
Save dmattera/ef11cb37c31d732f9e5d2347eea876c2 to your computer and use it in GitHub Desktop.
A simple way to set custom indentation levels when using BeautifulSoup's soup.prettify()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Python == 3.6.2
# bs4 == 4.6.0
# The current version of BeautifulSoup's soup.prettify() function only allows for
# an indentation level equal to 1 space. This is a simple, reliable way to allow for the use
# of any indentation level you wish.
import requests
from bs4 import BeautifulSoup

# Demo input: download a page and parse it with the built-in "html.parser".
url = "https://www.google.com"
# Without a timeout requests waits indefinitely on an unresponsive server,
# so bound the wait to keep the script from hanging.
r = requests.get(url, timeout=10)
soup = BeautifulSoup(r.content, "html.parser")
def soup_prettify2(soup, desired_indent):
    """Return ``soup.prettify()`` re-indented to ``desired_indent`` spaces per level.

    The prettified markup is processed line by line. It is assumed to use
    exactly one space of indentation per nesting level, so the number of
    spaces in front of a tag is taken as that line's depth.

    Args:
        soup: Object whose ``prettify()`` method returns the markup as a
            string (in this file, a ``BeautifulSoup`` instance).
        desired_indent: Number of spaces per nesting level, as an int.

    Returns:
        The re-indented markup as a single string. Every line, including
        the last one, is terminated by a newline character.

    NOTE: every line is padded, including lines that belong to
    whitespace-sensitive elements such as ``<pre>``, so the content of
    those elements is altered.
    """
    rebuilt = []
    tag_indent = 0  # depth of the most recent line that started with a tag
    for raw_line in soup.prettify().split("\n"):
        line = str(raw_line)
        body = line.lstrip(" ")
        leading = len(line) - len(body)
        # A line counts as a tag only when '<' is its first non-space
        # character; a '<' further along (e.g. "a < b" inside a script) is
        # ordinary text. A '<' more than two columns deeper than the last
        # tag cannot be a genuine child tag either, so it is treated as
        # text as well.
        if body.startswith("<") and leading <= tag_indent + 2:
            current_indent = tag_indent = leading
        else:
            # Text or script content: indent it as a child of the last tag.
            # tag_indent is deliberately left untouched so that several
            # consecutive text lines share one depth instead of each being
            # pushed one level deeper than the line before it.
            current_indent = tag_indent + 1
        rebuilt.append(write_new_line(line, current_indent, desired_indent))
    # Join once at the end instead of growing a string inside the loop.
    return "".join(rebuilt)
def write_new_line(line, current_indent, desired_indent):
    """Return ``line`` widened to ``desired_indent`` spaces per level, plus a newline.

    ``line`` is expected to still carry its existing indentation of
    ``current_indent`` spaces (one per level); this is not checked. Only the
    difference between the desired and the existing width is prepended.

    Args:
        line: One line of markup, converted with ``str()`` before use.
        current_indent: Nesting depth of the line, which is also the number
            of spaces it is assumed to be indented by already.
        desired_indent: Number of spaces wanted per nesting level.

    Returns:
        The padded line followed by a newline character. When the desired
        width is not larger than the existing one, the line is returned
        unpadded; existing indentation is never removed.
    """
    # Total width wanted minus the spaces the line is assumed to have already.
    spaces_to_add = (current_indent * desired_indent) - current_indent
    # max() spells out that a zero or negative difference adds nothing.
    return " " * max(spaces_to_add, 0) + str(line) + "\n"
# Demo: re-indent the parsed page with four spaces per level and print it.
pretty_soup = soup_prettify2(soup, desired_indent=4)
print(pretty_soup)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
breaks on
<pre>