thatguynef · March 7, 2023 11:56
diff --git a/web_scraping.py b/web_scraping.py
 # YouTube video: https://youtu.be/dwCe9l_geK4
 import requests
 import json
 import os
 import unicodedata
 from bs4 import BeautifulSoup

 # Listing pages to scrape. Each page's description text is written to a
 # JSON file named after the last path segment of its URL.
 urls = [
     'https://www.bigbearcoolcabins.com/big-bear-cabin-rentals/moonridge-cali-bear-cabin/',
     'https://www.bigbearcoolcabins.com/big-bear-cabin-rentals/switzerland',
     "https://www.bigbearcoolcabins.com/big-bear-cabin-rentals/pines",
 ]

 # Remove the trailing slash if present; otherwise os.path.basename() below
 # would return an empty string for that URL.
 urls = [url.rstrip('/') for url in urls]

 # Seconds to wait for the server. Without an explicit timeout, requests
 # waits indefinitely on a stalled connection.
 REQUEST_TIMEOUT = 30

 # CSS selector for the paragraphs and list items of the property description.
 DESCRIPTION_SELECTOR = '#node-vr-listing-full-group-vr-property-desc p, #node-vr-listing-full-group-vr-property-desc ul li'

 for url in urls:
     try:
         response = requests.get(url, timeout=REQUEST_TIMEOUT)
         # Turn 4xx/5xx answers into exceptions so an error page is not
         # saved as if it were a listing.
         response.raise_for_status()
     except requests.RequestException as exc:
         # Best effort: report the failure and carry on with the next URL.
         print('Skipping {}: {}'.format(url, exc))
         continue

     soup = BeautifulSoup(response.text, 'html.parser')

     # Extract the content within the HTML paragraph and list-item tags;
     # every fragment is followed by a single space.
     paragraphs = soup.select(DESCRIPTION_SELECTOR)
     content = ''.join(paragraph.get_text() + ' ' for paragraph in paragraphs)

     # Remove special Unicode characters: decompose with NFKD, then drop
     # everything that cannot be encoded as ASCII.
     content = unicodedata.normalize('NFKD', content).encode('ascii', 'ignore').decode()

     # Extract the file name from the URL (its last path segment)
     file_name = os.path.basename(url)

     # Create a dictionary with the data
     data = {'content': content}

     # Save the data to a .json file in the current working directory
     with open(file_name + '.json', 'w', encoding='utf-8') as outfile:
         json.dump(data, outfile, separators=(',', ':'), indent=2)
	# YouTube video: https://youtu.be/dwCe9l_geK4
	import requests
	import json
	import os
	import unicodedata
	from bs4 import BeautifulSoup

	# Listing pages to scrape. Each page's description text is written to a
	# JSON file named after the last path segment of its URL.
	urls = [
	    'https://www.bigbearcoolcabins.com/big-bear-cabin-rentals/moonridge-cali-bear-cabin/',
	    'https://www.bigbearcoolcabins.com/big-bear-cabin-rentals/switzerland',
	    "https://www.bigbearcoolcabins.com/big-bear-cabin-rentals/pines",
	]

	# Remove the trailing slash if present; otherwise os.path.basename() below
	# would return an empty string for that URL.
	urls = [url.rstrip('/') for url in urls]

	# Seconds to wait for the server. Without an explicit timeout, requests
	# waits indefinitely on a stalled connection.
	REQUEST_TIMEOUT = 30

	# CSS selector for the paragraphs and list items of the property description.
	DESCRIPTION_SELECTOR = '#node-vr-listing-full-group-vr-property-desc p, #node-vr-listing-full-group-vr-property-desc ul li'

	for url in urls:
	    try:
	        response = requests.get(url, timeout=REQUEST_TIMEOUT)
	        # Turn 4xx/5xx answers into exceptions so an error page is not
	        # saved as if it were a listing.
	        response.raise_for_status()
	    except requests.RequestException as exc:
	        # Best effort: report the failure and carry on with the next URL.
	        print('Skipping {}: {}'.format(url, exc))
	        continue

	    soup = BeautifulSoup(response.text, 'html.parser')

	    # Extract the content within the HTML paragraph and list-item tags;
	    # every fragment is followed by a single space.
	    paragraphs = soup.select(DESCRIPTION_SELECTOR)
	    content = ''.join(paragraph.get_text() + ' ' for paragraph in paragraphs)

	    # Remove special Unicode characters: decompose with NFKD, then drop
	    # everything that cannot be encoded as ASCII.
	    content = unicodedata.normalize('NFKD', content).encode('ascii', 'ignore').decode()

	    # Extract the file name from the URL (its last path segment)
	    file_name = os.path.basename(url)

	    # Create a dictionary with the data
	    data = {'content': content}

	    # Save the data to a .json file in the current working directory
	    with open(file_name + '.json', 'w', encoding='utf-8') as outfile:
	        json.dump(data, outfile, separators=(',', ':'), indent=2)
No results found