Last active
July 19, 2024 15:02
-
-
Save pyoneerC/0030be5deca5e8f324053a2ba1dbc87b to your computer and use it in GitHub Desktop.
Web scraper that fetches the prices of an item on MercadoLibre Argentina, plots a histogram of the prices with USD conversions, and shows an image of the first item found. Can scan N result pages (default 5, minimum 1, maximum 15).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script to get the prices of an item in MercadoLibre Argentina and plot a histogram of the prices
from matplotlib.offsetbox import OffsetImage, AnnotationBbox | |
import matplotlib.ticker as ticker | |
import matplotlib.pyplot as plt | |
from bs4 import BeautifulSoup | |
from PIL import Image | |
import numpy as np | |
import requests | |
import datetime | |
import re | |
import io | |
# Endpoint queried by get_exchange_rate(); the 'venta' field of its JSON
# response is used as the ARS-per-USD rate.
API_URL = 'https://dolarapi.com/v1/dolares/blue'

# Default number of result pages to scan; main() overwrites it with the
# user's answer (clamped to 1..15).
NUMBER_OF_PAGES = 5
def get_exchange_rate():
    """Fetch the current exchange rate from the API.

    Returns:
        The value of the ``venta`` field of the JSON document served at
        ``API_URL``, or ``None`` when the request fails or the response does
        not contain a usable ``venta`` field.
    """
    try:
        # A timeout keeps the script from hanging forever on a stalled connection.
        response = requests.get(API_URL, timeout=10)
        response.raise_for_status()  # Raise an exception if the request failed
        return response.json()['venta']
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
    except (ValueError, KeyError, TypeError) as e:
        # Body was not valid JSON, or the JSON had no 'venta' entry: report it
        # the same way as a network failure instead of crashing the caller.
        print(f"Error: unexpected exchange rate response ({e!r})")
    return None


# Module-level rate read by plot_prices() and print_statistics() for the
# ARS -> USD conversions. It is None when the fetch above failed.
venta_dolar = get_exchange_rate()
def get_prices(item):
    """Fetch the prices of the given item from MercadoLibre.

    Scans ``NUMBER_OF_PAGES`` listing pages, requesting results at start
    offsets 1, 51, 101, ... and collecting every price fraction and every
    product image URL found on each page.

    Args:
        item: Search term as it is placed in the listing URL.

    Returns:
        A tuple ``(prices_list, url, image_urls)`` where ``prices_list`` is a
        list of ints, ``url`` is the last page URL requested and
        ``image_urls`` is a list of image ``src`` values. Returns
        ``(None, None, None)`` as soon as any page request fails.
    """
    prices_list = []
    image_urls = []
    url = None  # stays None only when no page is requested at all

    for page in range(NUMBER_OF_PAGES):
        start_item = page * 50 + 1
        url = f'https://listado.mercadolibre.com.ar/{item}_Desde_{start_item}_NoIndex_True'

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Use the response attached to the exception: the local name
            # `response` is unbound (or left over from the previous page)
            # when requests.get() itself raised, e.g. on a connection error.
            failed = e.response
            if failed is not None and failed.status_code == 404:
                print(f'Error: {failed.status_code} - {failed.reason}')
                print('Please scan less pages!')
            else:
                print(f"Error: {e}")
            return None, None, None

        soup = BeautifulSoup(response.content, 'html.parser')

        for price in soup.find_all('span', class_='andes-money-amount__fraction'):
            digits = re.sub(r'\D', '', price.text)
            if digits:  # int('') would raise on a span without any digit
                prices_list.append(int(digits))

        images = soup.find_all('img', class_='poly-component__picture poly-component__picture--square')
        for image in images:
            src = image.get('src')
            if src:  # skip <img> tags that carry no src attribute
                image_urls.append(src)

    return prices_list, url, image_urls
def format_x(value, tick_number):
    """Render a tick value as a whole number with thousands separators.

    ``tick_number`` is the second argument of the two-argument formatter
    signature this function is registered with; it is not used.
    """
    whole = int(value)
    return f"{whole:,}"
def _usd_suffix(amount):
    """Return ' (N USD)' for an ARS amount, or '' when no exchange rate is known."""
    if not venta_dolar:
        return ''
    return ' (' + "{:,.0f}".format(amount / venta_dolar) + ' USD)'


def _mark_price(amount, name, y_position, x_offset, line_style, text_style):
    """Draw a vertical line at ``amount`` and a rotated caption just right of it."""
    plt.axvline(amount, **line_style)
    caption = name + ': ' + "{:,}".format(int(amount)) + ' ARS' + _usd_suffix(amount)
    plt.text(amount + x_offset, y_position, caption, rotation=90, **text_style)


def plot_prices(prices_list, item, url, image_urls):
    """Plot a histogram of the prices.

    Vertical markers are drawn for the median, mean, max, min, mean +/- one
    standard deviation and the 25th percentile; when at least one image URL is
    available the first image is placed near the top of the plot at the
    maximum price.

    Args:
        prices_list: Prices in ARS. When empty, a message is printed and
            nothing is plotted.
        item: Search term with dashes instead of spaces; used in the title.
        url: Accepted for interface compatibility; not used by the plot.
        image_urls: Image URLs; only the first one is downloaded.
    """
    if len(prices_list) == 0:
        # max()/min() raise on an empty sequence, so there is nothing to draw.
        print('No prices to plot.')
        return

    plt.figure(figsize=(10, 5))
    plt.hist(prices_list, bins=20, color='lightblue', edgecolor='black')
    axes = plt.gca()

    # Plain numbers in the x-axis
    plt.ticklabel_format(style='plain', axis='x')
    axes.xaxis.set_major_formatter(ticker.FuncFormatter(format_x))

    y_position = axes.get_ylim()[1] * 0.05

    # Statistics of the selected item, each computed once.
    std_dev = np.std(prices_list)
    avg_price = np.mean(prices_list)
    median_price = np.median(prices_list)
    max_price = max(prices_list)
    min_price = min(prices_list)
    first_quartile = np.percentile(prices_list, 25)

    # Adjust the x position offset of the captions according to the median price
    x_pos_offset = 500
    for threshold, offset in ((70000, 10000), (50000, 3500), (20000, 2000), (10000, 1000)):
        if median_price > threshold:
            x_pos_offset = offset
            break

    plt.xlabel('Price in ARS')
    plt.ylabel('Frequency')

    current_date = datetime.date.today().strftime('%d/%m/%Y')
    base_url = 'https://listado.mercadolibre.com.ar/' + item
    plt.title('Histogram of ' + item.replace('-', ' ').upper() + ' prices in MercadoLibre Argentina ' +
              '(' + current_date + ')' + '\n' +
              'Number of items indexed: ' + str(len(prices_list)) + ' (' + str(NUMBER_OF_PAGES) + ' pages)' + '\n' +
              'URL: ' + base_url + '\n')

    # NOTE(review): the legend below is given labels only, so they appear to be
    # matched to the vertical lines in the order the lines are created. Keep
    # this drawing order in sync with the label list (the second std-dev line
    # is deliberately drawn last so it gets no label) - confirm if reordering.
    extreme_line = {'color': 'blue', 'linestyle': 'dashed', 'linewidth': 1}
    std_line = {'color': 'black', 'linestyle': 'dotted', 'linewidth': 3}

    _mark_price(median_price, 'Median', y_position, x_pos_offset,
                {'color': 'red', 'linestyle': 'solid', 'linewidth': 1}, {'color': 'red'})
    _mark_price(avg_price, 'Avg', y_position, x_pos_offset,
                {'color': 'purple', 'linestyle': 'solid', 'linewidth': 1}, {'color': 'purple'})
    _mark_price(max_price, 'Max', y_position, x_pos_offset, extreme_line, {})
    _mark_price(min_price, 'Min', y_position, x_pos_offset, extreme_line, {})
    plt.axvline(avg_price + std_dev, **std_line)
    _mark_price(first_quartile, '25th percentile', y_position, x_pos_offset,
                {'color': 'green', 'linestyle': 'dashed', 'linewidth': 2}, {'color': 'green'})
    plt.axvline(avg_price - std_dev, **std_line)

    plt.legend(['Median', 'Avg', 'Max', 'Min', 'Std Dev', '25th percentile'], loc='upper right')

    if image_urls:
        try:
            image_response = requests.get(image_urls[0], timeout=10)
            image_response.raise_for_status()
            img = Image.open(io.BytesIO(image_response.content))
        except (requests.exceptions.RequestException, OSError) as e:
            # The thumbnail is decorative: a bad URL or a non-image payload
            # must not prevent the histogram from being shown.
            print(f"Could not load the item image: {e}")
        else:
            ylim = axes.get_ylim()
            ytop = ylim[1] - 0.1 * (ylim[1] - ylim[0])
            imagebox = OffsetImage(img, zoom=0.2)
            ab = AnnotationBbox(imagebox, (max_price, ytop), frameon=False)
            axes.add_artist(ab)

    plt.grid(True)
    plt.tight_layout()
    plt.show()
def print_statistics(prices_list, item, url):
    """Print the statistics of the prices.

    Prints the median, mean, max and min price in ARS and, when the
    module-level exchange rate ``venta_dolar`` is available, the same figures
    converted to USD.

    Args:
        prices_list: Prices in ARS. When empty, a message is printed and no
            statistics are computed.
        item: Search term with dashes instead of spaces; shown in the header.
        url: URL printed alongside the statistics.
    """
    if len(prices_list) == 0:
        # max()/min() raise on an empty sequence and the mean would be NaN.
        print('No prices found, no statistics to print.')
        return

    figures = (
        ('Median price: ', np.median(prices_list)),
        ('Avg price: ', np.mean(prices_list)),
        ('Max price: ', max(prices_list)),
        ('Min price: ', min(prices_list)),
    )

    print(f'Statistics of \'{item.replace("-", " ").upper()}\' prices in MercadoLibre Argentina:')
    print('Number of items: ', len(prices_list))
    if venta_dolar:
        print('Dollar price: ', "{:,}".format(int(venta_dolar)) + ' ARS')
    else:
        print('Dollar price: ', 'unavailable')
    print('url: ', url)
    print('-' * 50)
    for caption, amount in figures:
        print(caption, "{:,}".format(int(amount)) + ' ARS')

    if not venta_dolar:
        # Without an exchange rate the USD section cannot be computed.
        return

    print('-' * 50)
    print('Prices in USD:')
    for caption, amount in figures:
        print(caption, "{:.2f}".format(amount / venta_dolar) + ' USD')
def main():
    """Main function to execute the script.

    Asks for the item and the number of pages, refreshes the exchange rate,
    scrapes the prices, then plots the histogram and prints the statistics.
    Returns early (after printing a message) when the exchange rate or the
    prices cannot be fetched, or when no price was found.
    """
    global NUMBER_OF_PAGES, venta_dolar

    # Validate the item before announcing the scan.
    item = input('Enter the item to scan: ').strip()
    while len(item) < 3:
        print('Please enter an item with at least 3 characters!')
        item = input('Enter the item to search: ').strip()

    pages_answer = input('Enter the number of pages to scan (default 5, min 1, max 15): ').strip()
    try:
        requested_pages = int(pages_answer) if pages_answer else NUMBER_OF_PAGES
    except ValueError:
        # A non-numeric answer falls back to the default instead of crashing.
        print(f'Invalid number of pages, using {NUMBER_OF_PAGES}.')
        requested_pages = NUMBER_OF_PAGES
    NUMBER_OF_PAGES = max(1, min(15, requested_pages))

    print(f'Scanning the first {NUMBER_OF_PAGES} \'{item}\' pages in MercadoLibre Argentina...')
    print('Please wait...')

    item = item.replace(' ', '-').lower()

    exchange_rate = get_exchange_rate()
    if exchange_rate is None:
        print("Failed to fetch exchange rate. Exiting.")
        return
    # plot_prices() and print_statistics() read the module-level rate, so
    # publish the value just fetched rather than relying on the import-time one.
    venta_dolar = exchange_rate

    prices_list, url, image_urls = get_prices(item)
    if prices_list is None or url is None:
        print("Failed to fetch prices. Exiting.")
        return
    if not prices_list:
        print("No prices found for this item. Exiting.")
        return

    plot_prices(prices_list, item, url, image_urls)
    print_statistics(prices_list, item, url)
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
Author
pyoneerC
commented
Jul 19, 2024
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment