Skip to content

Instantly share code, notes, and snippets.

@bjornblissing
Last active October 9, 2025 09:35
Show Gist options
  • Save bjornblissing/ee90f045445e61900f0eaa4b21fd9fab to your computer and use it in GitHub Desktop.
Save bjornblissing/ee90f045445e61900f0eaa4b21fd9fab to your computer and use it in GitHub Desktop.
Lista dagens luncher i Ebbepark
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup
import re
from datetime import date
from PIL import Image
import easyocr
import io
import warnings
import os
# Quiet down EasyOCR/torch at import/run time:
# - EASYOCR_VERBOSE=0 asks EasyOCR to reduce its console output
#   (must be set before the Reader is created in don_luigi()).
# - torch emits a UserWarning when no GPU is available; suppress it so the
#   printed menus are not interleaved with warnings.
os.environ['EASYOCR_VERBOSE'] = '0'
warnings.filterwarnings("ignore", category=UserWarning, module="torch")
def strip_trailing_empty_lines(s: str) -> str:
    """Return *s* without trailing blank (or whitespace-only) lines.

    Interior blank lines are preserved; line endings are normalized to
    "\\n" by re-joining the split lines.
    """
    rows = s.splitlines()
    end = len(rows)
    # Walk backwards past every line that is empty after stripping.
    while end > 0 and not rows[end - 1].strip():
        end -= 1
    return "\n".join(rows[:end])
def laluna():
    """Print today's lunch menu from La Luna (lalunat1.se).

    Follows the "Lunch" navigation link from the main page, normalizes the
    page's <br>-based layout into one dish per line, and prints the menu
    with day/section headers at column 0 and dishes tab-indented.
    Prints an error message and returns early on any failure.
    """
    print("\n=============")
    print("La Luna")
    print("=============")
    base_url = "http://www.lalunat1.se"
    # First, get the main page to find the lunch link
    try:
        response = requests.get(base_url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching main page: {e}")
        return
    soup = BeautifulSoup(response.content, "html.parser")
    # Find the lunch link in the navigation
    lunch_link = None
    # Look for links whose visible text is exactly "lunch" (case-insensitive)
    for link in soup.find_all("a", href=True):
        if link.get_text(strip=True).lower() == "lunch":
            lunch_link = link["href"]
            break
    if not lunch_link:
        print("Could not find lunch page link.")
        return
    # Construct full URL if it's a relative link
    if lunch_link.startswith("/"):
        lunch_url = base_url + lunch_link
    elif lunch_link.startswith("http"):
        lunch_url = lunch_link
    else:
        lunch_url = base_url + "/" + lunch_link
    # Now get the lunch page
    try:
        response = requests.get(lunch_url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching lunch page: {e}")
        return
    soup = BeautifulSoup(response.content, "html.parser")
    # Treat a double <br> followed by a capital letter as a dish separator.
    # Done on the raw HTML string so the marker survives the later
    # get_text() extraction (tags themselves would be lost).
    html_content = str(soup)
    html_content = re.sub(r"<br[^>]*>\s*<br[^>]*>\s*([A-ZÅÄÖ])", r"__DISH_SEPARATOR__\1", html_content, flags=re.IGNORECASE)
    soup = BeautifulSoup(html_content, "html.parser")
    # Replace remaining single <br> tags with spaces
    for br in soup.find_all(["br", "br/"]):
        br.replace_with(" ")
    # Also handle self-closing br tags and other potential line break elements
    for element in soup.find_all():
        if element.name in ["br", "hr"] or (element.name and element.name.lower() == "br"):
            element.replace_with(" ")
    # Large bold spans are how the site styles dish names
    # (class "wysiwyg-font-size-24" containing a <b> tag); mark those that
    # start with a capital letter as the beginning of a new dish.
    for span in soup.find_all("span", class_="wysiwyg-font-size-24"):
        if span.find("b"):  # Only if it contains a <b> tag
            # Check if the span text starts with a capital letter
            span_text = span.get_text(strip=True)
            if span_text and span_text[0].isupper():
                span.insert_before("__DISH_SEPARATOR__")
                # Keep the span content but mark it as a new dish
    # Try progressively broader selectors until one yields menu-like text
    selectors = [
        "div[class*='widget-content-']",
        ".widget-content",
        "div[class*='content']",
        "p", "div"
    ]
    lunch_text = ""
    for selector in selectors:
        elements = soup.select(selector)
        for element in elements:
            # Get text with separator to handle missing spaces between elements
            text = element.get_text(separator=" ", strip=True)
            # Keep only fragments that look like menu content (Swedish
            # weekday names, "vecka" = week, etc.).
            if text and any(keyword in text.lower() for keyword in ["vecka", "måndag", "tisdag", "onsdag", "torsdag", "fredag", "dagens lunch", "special", "vegetarisk"]):
                if text not in lunch_text:  # Avoid duplicates
                    lunch_text += text + "\n"
        if lunch_text.strip():  # If we found content, break
            break
    if not lunch_text.strip():
        print("Could not find lunch menu content.")
        return
    # Clean up string: normalize invisible whitespace characters
    replacements = {
        u"\u00A0": " ",  # non-breaking space
        u"\u200B": " ",  # zero width space
    }
    for pattern, repl in replacements.items():
        lunch_text = lunch_text.replace(pattern, repl)
    # First, handle the dish separators
    lunch_text = re.sub(r"__DISH_SEPARATOR__", r"\n__SECOND_DISH__", lunch_text)
    # Collapse lines with misplaced line breaks, but preserve important separators
    lunch_text = re.sub(r"(\S)[ \t]*\n[ \t]*(?!__SECOND_DISH__|Måndag|Tisdag|Onsdag|Torsdag|Fredag|Veckans)(\S)", r"\1 \2", lunch_text)
    lunch_text = re.sub(r"[ \t]{2,}", " ", lunch_text)
    lunch_text = re.sub(r"^[ ]+", "", lunch_text, flags=re.MULTILINE)
    # Normalize headers with a single newline after each day header
    day_headers = [
        "Måndag", "Tisdag", "Onsdag", "Torsdag", "Fredag",
        "Veckans sallad", "Veckans special", "Veckans vegetariska"
    ]
    for header in day_headers:
        lunch_text = re.sub(fr"{header}[\s:]*", f"\n{header}\n", lunch_text, flags=re.IGNORECASE)
    # Add new line after week number
    lunch_text = re.sub(r"(Vecka \d+)", r"\1\n", lunch_text, flags=re.IGNORECASE)
    # Add newlines after "Dagens lunch"
    lunch_text = re.sub(r"(Dagens lunch)", r"\1\n", lunch_text, flags=re.IGNORECASE)
    # Clean up extra line breaks that might have been introduced
    lunch_text = re.sub(r"\n\s*\n\s*__SECOND_DISH__", r"\n__SECOND_DISH__", lunch_text)
    # Drop the trailing marketing text ("Luncha med oss!" = "Have lunch
    # with us!") while keeping the menu content around it.
    lunch_text = re.sub(r"(vegetarisk|sallad)(.*?)(Luncha med oss!)", r"\1\2", lunch_text, flags=re.IGNORECASE)
    lunch_text = re.sub(r"(ris) (Luncha med oss!)", r"\1", lunch_text, flags=re.IGNORECASE)
    lunch_text = re.sub(r"(Luncha med oss!)(.*?)(Alla ordinarie)", r"\n\3", lunch_text, flags=re.IGNORECASE)
    lunch_text = re.sub(r"(lunchmenyn)(.*?)(Vi serverar)", r"\1\n\3", lunch_text, flags=re.IGNORECASE)
    # Add line break before "Alla ordinarie pizzor" if it doesn't already have one
    lunch_text = re.sub(r"([^\n])(Alla ordinarie)", r"\1\n\2", lunch_text, flags=re.IGNORECASE)
    # Drop blank lines (splitlines(True) keeps line endings so non-blank
    # lines are re-joined unchanged).
    lunch_text = "".join([s for s in lunch_text.splitlines(True) if s.strip("\r\n")])
    # Split into lines and process each line for consistent indentation
    lines = lunch_text.split('\n')
    processed_lines = []
    for line in lines:
        line = line.strip()
        if line:
            # Skip the "Luncha med oss!" line
            if line == "Luncha med oss!":
                continue
            # Check if this is a header line (should not be indented)
            elif re.match(r'^(Måndag|Tisdag|Onsdag|Torsdag|Fredag|Veckans sallad|Veckans special|Veckans vegetariska|Vecka \d+|Dagens lunch)$', line):
                processed_lines.append(line)
            elif line.startswith('__SECOND_DISH__'):
                # Handle second dish marker - replace with tab and add any remaining text
                remaining_text = line.replace('__SECOND_DISH__', '').strip()
                if remaining_text:
                    processed_lines.append(f'\t{remaining_text}')
                else:
                    processed_lines.append('\t')
            else:
                # This should be indented content
                if not line.startswith('\t'):
                    processed_lines.append(f'\t{line}')
                else:
                    processed_lines.append(line)
    lunch_text = '\n'.join(processed_lines)
    lunch_text = strip_trailing_empty_lines(lunch_text)
    print(lunch_text)
def get_lazlos_week_menu(week_tag):
    """Print the lunch menu that follows *week_tag* in the document.

    Looks for the next element whose id matches ``fdm-menu-<number>``,
    then prints each day's <h3> header followed by its dishes
    (tab-indented titles, further-indented description paragraphs).
    Prints an error message and returns if no menu element is found.
    """
    menu = week_tag.find_next(id=re.compile(r'^fdm-menu-\d+$'))
    if menu is None:
        print("Could not find lunch menu on the page.")
        return
    for day in menu.find_all("ul"):
        header = day.find("h3")
        if header:
            print(f"\n{header.get_text(strip=True)}")
        for dish in day.find_all("li", class_="fdm-item"):
            title = dish.find("p", class_="fdm-item-title")
            if title:
                print(f"\t{title.get_text(strip=True)}")
            description = dish.find("div", class_="fdm-item-content")
            if description:
                for paragraph in description.find_all("p"):
                    detail = paragraph.get_text(strip=True)
                    if detail:
                        print(f"\t {detail}")
def laszlos_krog():
    """Print this week's lunch menu from Laszlos Krog in Ebbepark.

    Fetches the restaurant page, scans the <p class="vecka"> headers
    ("vecka" = week) for the one whose "VECKA <n>" text matches the
    current ISO week number, and delegates printing of that week's menu
    to get_lazlos_week_menu(). Prints an error message and returns on
    network failure or when the current week is not on the page.
    """
    print("\n=============")
    print("Laszlos Krog")
    print("=============")
    url = "https://www.laszloskrog.se/ebbepark/"
    try:
        # Fix: a timeout so a stalled server cannot hang the whole script
        # (requests.get has no default timeout).
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching page: {e}")
        return
    soup = BeautifulSoup(response.content, "html.parser")
    week_tags = soup.find_all("p", class_="vecka")
    current_week_num = date.today().isocalendar().week
    for week_tag in week_tags:
        strong_tag = week_tag.find("strong")
        if not strong_tag:
            continue
        week_text = strong_tag.get_text(strip=True)
        match = re.match(r"VECKA\s+(\d+)", week_text, flags=re.IGNORECASE)
        if match:
            week_num = int(match.group(1))
            if week_num == current_week_num:
                get_lazlos_week_menu(week_tag)
                return  # Stop after finding the correct week
    print("Current week's menu not found.")
def don_luigi():
    """Print the lunch menu from Don Luigi by OCR-ing the menu image.

    The site publishes the menu only as an image: locate it inside the
    "entry-content" div, download it, run EasyOCR (Swedish + English) on
    it, then reformat the raw OCR text into sections with tab-indented
    items. Prints an error message and returns early on any failure.
    """
    print("\n=============")
    print("Don Luigi")
    print("=============")
    url = "https://www.donluigi.se/lunchmeny/"
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching page: {e}")
        return
    soup = BeautifulSoup(response.content, "html.parser")
    # Find the entry-content div
    entry_content = soup.find("div", class_="entry-content")
    if not entry_content:
        print("Could not find entry-content div.")
        return
    # Find the image within the entry-content div
    img_tag = entry_content.find("img")
    if not img_tag:
        print("Could not find image in entry-content div.")
        return
    img_src = img_tag.get("src")
    if not img_src:
        print("Could not find image source.")
        return
    # Make sure we have the full URL (handle protocol-relative and
    # site-relative src values)
    if img_src.startswith("//"):
        img_src = "https:" + img_src
    elif img_src.startswith("/"):
        img_src = "https://www.donluigi.se" + img_src
    try:
        # Download the image
        img_response = requests.get(img_src)
        img_response.raise_for_status()
        # Open the image with PIL
        # NOTE(review): `image` is never used afterwards — readtext() below
        # is fed the raw bytes; this open() only validates the download is
        # a decodable image.
        image = Image.open(io.BytesIO(img_response.content))
        # Use EasyOCR to extract text from the image.
        # Initialize EasyOCR reader with Swedish and English languages.
        # Reader() prints a GPU warning even with verbose=False, so swap
        # stdout for a throwaway buffer while constructing it.
        import sys
        from io import StringIO
        old_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            reader = easyocr.Reader(['sv', 'en'], verbose=False)  # Swedish and English
        finally:
            sys.stdout = old_stdout
        # Extract text from the image (readtext accepts raw image bytes)
        results = reader.readtext(img_response.content)
        # Combine all detected text
        text_parts = []
        for (bbox, text, confidence) in results:
            if confidence > 0.5:  # Only include text with reasonable confidence
                text_parts.append(text)
        text = ' '.join(text_parts)
        if not text.strip():
            print("Could not extract text from the menu image.")
            return
        # Clean up the extracted text:
        # replace runs of whitespace with single spaces
        text = re.sub(r'\s+', ' ', text.strip())
        # Drop everything from the footnote "*Det går bra att beställa"
        # ("*You may order...") onwards — it is not menu content.
        after_work_pos = text.find('*Det går bra att beställa')
        if after_work_pos != -1:
            text = text[:after_work_pos].strip()
        # Fix common OCR artifacts
        text = text.replace('_', ', ')  # OCR sometimes reads commas as underscores
        # Previously-needed OCR fixes, kept for reference:
        #text = text.replace('írån', 'från') # Fix Swedish characters
        #text = text.replace('Falaíel', 'Falafel') # Fix OCR error
        #text = text.replace('49krlextra', '49kr extra') # Fix price formatting
        # Try to identify and format different sections
        formatted_text = text
        # Remove prices (3 or more digits)
        formatted_text = re.sub(r'\b\d{3,}\b', '', formatted_text)
        # Add line breaks before key sections
        formatted_text = re.sub(r'(LUNCHMENY|Veckans|Pasta|Sallad|Husman)', r'\n\1', formatted_text)
        # "MAN - FRE" / "MÅN FRE" are OCR variants of Mon-Fri
        formatted_text = re.sub(r'(MAN - FRE|MÅN FRE)', r'\n\1', formatted_text)
        # Add line breaks after specific headers and indent the following content
        formatted_text = re.sub(r'(Veckans Pinsa)\s+([A-ZÅÄÖ])', r'\1\n\t\2', formatted_text)
        formatted_text = re.sub(r'(Pasta)\s+(Pasta|[A-ZÅÄÖ])', r'\1\n\t\2', formatted_text)
        formatted_text = re.sub(r'(Sallad)\s+([A-ZÅÄÖ])', r'\1\n\t\2', formatted_text)
        formatted_text = re.sub(r'(Husman)\s+([A-ZÅÄÖ])', r'\1\n\t\2', formatted_text)
        # Clean up multiple newlines
        formatted_text = re.sub(r'\n\s*\n', '\n', formatted_text)
        # Split into lines for further processing
        lines = formatted_text.split('\n')
        final_lines = []
        for line in lines:
            line = line.strip()
            if line:
                # Check if this is a main section header (should not be indented)
                if any(keyword in line.upper() for keyword in ['LUNCHMENY', 'MAN - FRE', 'MÅN FRE']):
                    # This is a main section header
                    if final_lines:  # Add spacing before headers (except first)
                        final_lines.append("")
                    final_lines.append(line)
                # Check if this is a menu item header (should not be indented)
                elif line in ['Veckans Pinsa', 'Pasta', 'Sallad', 'Husman']:
                    if final_lines:  # Add spacing before headers (except first)
                        final_lines.append("")
                    final_lines.append(line)
                else:
                    # Regular text, indent if not already indented
                    if not line.startswith('\t'):
                        final_lines.append(f"\t{line}")
                    else:
                        final_lines.append(line)
        final_text = '\n'.join(final_lines)
        if final_text.strip():
            print(final_text)
        else:
            print("No readable menu content found in the image.")
    except requests.RequestException as e:
        print(f"Error downloading image: {e}")
    except Exception as e:
        # Broad catch: PIL decode errors, EasyOCR failures, etc. — report
        # and move on rather than abort the other restaurants' output.
        print(f"Error processing image: {e}")
if __name__ == "__main__":
    # Print today's lunch menus for the three Ebbepark restaurants.
    laszlos_krog()
    laluna()
    don_luigi()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment