List today's lunches in Ebbepark
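The script scrapes three restaurant sites (Laszlos Krog, La Luna, Don Luigi) and prints the current lunch menus to the terminal; Don Luigi publishes its menu as an image, so that one is run through OCR. A minimal setup sketch, assuming the PyPI package names implied by the imports and a hypothetical file name lunch.py (the gist does not pin any dependencies; easyocr pulls in PyTorch, which is a large download):

    pip install requests beautifulsoup4 easyocr
    python3 lunch.py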
#!/usr/bin/env python3
import os
import re
import sys
import warnings
from datetime import date
from io import StringIO

import easyocr
import requests
from bs4 import BeautifulSoup

# Suppress EasyOCR GPU warnings
os.environ['EASYOCR_VERBOSE'] = '0'
warnings.filterwarnings("ignore", category=UserWarning, module="torch")


def strip_trailing_empty_lines(s: str) -> str:
    lines = s.splitlines()
    while lines and lines[-1].strip() == "":
        lines.pop()
    return "\n".join(lines)

def laluna():
    print("\n=============")
    print("La Luna")
    print("=============")
    base_url = "http://www.lalunat1.se"
    # First, get the main page to find the lunch link
    try:
        response = requests.get(base_url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching main page: {e}")
        return
    soup = BeautifulSoup(response.content, "html.parser")
    # Find the lunch link in the navigation: look for a link whose text is "Lunch"
    lunch_link = None
    for link in soup.find_all("a", href=True):
        if link.get_text(strip=True).lower() == "lunch":
            lunch_link = link["href"]
            break
    if not lunch_link:
        print("Could not find lunch page link.")
        return
    # Construct the full URL if it is a relative link
    if lunch_link.startswith("/"):
        lunch_url = base_url + lunch_link
    elif lunch_link.startswith("http"):
        lunch_url = lunch_link
    else:
        lunch_url = base_url + "/" + lunch_link
    # Now get the lunch page
    try:
        response = requests.get(lunch_url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching lunch page: {e}")
        return
    soup = BeautifulSoup(response.content, "html.parser")
    # Treat double <br> tags as dish separators, but only when followed by a
    # capital letter. The (?i:...) group keeps the <br> match case-insensitive
    # without also making [A-ZÅÄÖ] match lowercase letters.
    html_content = str(soup)
    html_content = re.sub(r"(?i:<br[^>]*>\s*<br[^>]*>)\s*([A-ZÅÄÖ])", r"__DISH_SEPARATOR__\1", html_content)
    soup = BeautifulSoup(html_content, "html.parser")
    # Replace the remaining single <br> tags (and <hr> rules) with spaces
    for tag in soup.find_all(["br", "hr"]):
        tag.replace_with(" ")
    # Mark dish-separator spans: spans with class "wysiwyg-font-size-24" that
    # contain a <b> tag and whose text starts with a capital letter. The span
    # content is kept; the marker flags the start of a new dish.
    for span in soup.find_all("span", class_="wysiwyg-font-size-24"):
        if span.find("b"):
            span_text = span.get_text(strip=True)
            if span_text and span_text[0].isupper():
                span.insert_before("__DISH_SEPARATOR__")
    # Try different selectors to find the menu content
    selectors = [
        "div[class*='widget-content-']",
        ".widget-content",
        "div[class*='content']",
        "p", "div"
    ]
    lunch_text = ""
    for selector in selectors:
        elements = soup.select(selector)
        for element in elements:
            # Get text with a separator to handle missing spaces between elements
            text = element.get_text(separator=" ", strip=True)
            if text and any(keyword in text.lower() for keyword in ["vecka", "måndag", "tisdag", "onsdag", "torsdag", "fredag", "dagens lunch", "special", "vegetarisk"]):
                if text not in lunch_text:  # Avoid duplicates
                    lunch_text += text + "\n"
        if lunch_text.strip():  # If we found content, stop trying selectors
            break
    if not lunch_text.strip():
        print("Could not find lunch menu content.")
        return
    # Clean up the string
    replacements = {
        "\u00A0": " ",  # non-breaking space
        "\u200B": " ",  # zero-width space
    }
    for pattern, repl in replacements.items():
        lunch_text = lunch_text.replace(pattern, repl)
    # Turn the dish separators into line breaks with a marker
    lunch_text = lunch_text.replace("__DISH_SEPARATOR__", "\n__SECOND_DISH__")
    # Collapse misplaced line breaks, but preserve the important separators
    lunch_text = re.sub(r"(\S)[ \t]*\n[ \t]*(?!__SECOND_DISH__|Måndag|Tisdag|Onsdag|Torsdag|Fredag|Veckans)(\S)", r"\1 \2", lunch_text)
    lunch_text = re.sub(r"[ \t]{2,}", " ", lunch_text)
    lunch_text = re.sub(r"^[ ]+", "", lunch_text, flags=re.MULTILINE)
    # Normalize headers: a newline before and after each day header
    day_headers = [
        "Måndag", "Tisdag", "Onsdag", "Torsdag", "Fredag",
        "Veckans sallad", "Veckans special", "Veckans vegetariska"
    ]
    for header in day_headers:
        lunch_text = re.sub(fr"{header}[\s:]*", f"\n{header}\n", lunch_text, flags=re.IGNORECASE)
    # Add a newline after the week number and after "Dagens lunch"
    lunch_text = re.sub(r"(Vecka \d+)", r"\1\n", lunch_text, flags=re.IGNORECASE)
    lunch_text = re.sub(r"(Dagens lunch)", r"\1\n", lunch_text, flags=re.IGNORECASE)
    # Clean up extra line breaks that might have been introduced
    lunch_text = re.sub(r"\n\s*\n\s*__SECOND_DISH__", r"\n__SECOND_DISH__", lunch_text)
    # Split off the additional info at the end and remove "Luncha med oss!"
    lunch_text = re.sub(r"(vegetarisk|sallad)(.*?)(Luncha med oss!)", r"\1\2", lunch_text, flags=re.IGNORECASE)
    lunch_text = re.sub(r"(ris) (Luncha med oss!)", r"\1", lunch_text, flags=re.IGNORECASE)
    lunch_text = re.sub(r"(Luncha med oss!)(.*?)(Alla ordinarie)", r"\n\3", lunch_text, flags=re.IGNORECASE)
    lunch_text = re.sub(r"(lunchmenyn)(.*?)(Vi serverar)", r"\1\n\3", lunch_text, flags=re.IGNORECASE)
    # Add a line break before "Alla ordinarie pizzor" if there is not one already
    lunch_text = re.sub(r"([^\n])(Alla ordinarie)", r"\1\n\2", lunch_text, flags=re.IGNORECASE)
    # Drop empty lines
    lunch_text = "".join(s for s in lunch_text.splitlines(True) if s.strip("\r\n"))
    # Re-indent line by line: headers flush left, dishes indented with a tab
    lines = lunch_text.split('\n')
    processed_lines = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Skip the "Luncha med oss!" line
        if line == "Luncha med oss!":
            continue
        # Header lines are not indented
        elif re.match(r'^(Måndag|Tisdag|Onsdag|Torsdag|Fredag|Veckans sallad|Veckans special|Veckans vegetariska|Vecka \d+|Dagens lunch)$', line):
            processed_lines.append(line)
        elif line.startswith('__SECOND_DISH__'):
            # Replace the second-dish marker with a tab plus any remaining text
            remaining_text = line.replace('__SECOND_DISH__', '').strip()
            processed_lines.append(f'\t{remaining_text}' if remaining_text else '\t')
        else:
            # Indent the dish text (the line was stripped above, so it cannot
            # already start with a tab)
            processed_lines.append(f'\t{line}')
    lunch_text = '\n'.join(processed_lines)
    lunch_text = strip_trailing_empty_lines(lunch_text)
    print(lunch_text)

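# A sketch of how the double-<br> dish-separator regex in laluna() behaves,
# on a hypothetical HTML fragment:
#   re.sub(r"(?i:<br[^>]*>\s*<br[^>]*>)\s*([A-ZÅÄÖ])", r"__DISH_SEPARATOR__\1",
#          "Köttbullar med potatis<br/><br/>Vegetarisk lasagne")
#   -> "Köttbullar med potatis__DISH_SEPARATOR__Vegetarisk lasagne"
# A lowercase letter after the breaks is treated as a wrapped line, not a new dish.
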
def get_laszlos_week_menu(week_tag):
    lunch_menu = week_tag.find_next(id=re.compile(r'^fdm-menu-\d+$'))
    if not lunch_menu:
        print("Could not find lunch menu on the page.")
        return
    days = lunch_menu.find_all("ul")
    for day in days:
        day_name_tag = day.find("h3")
        if day_name_tag:
            print(f"\n{day_name_tag.get_text(strip=True)}")
        food_items = day.find_all("li", class_="fdm-item")
        for item in food_items:
            food_title_tag = item.find("p", class_="fdm-item-title")
            if food_title_tag:
                print(f"\t{food_title_tag.get_text(strip=True)}")
            food_content_div = item.find("div", class_="fdm-item-content")
            if food_content_div:
                subtitles = food_content_div.find_all("p")
                for subtitle in subtitles:
                    subtitle_text = subtitle.get_text(strip=True)
                    if subtitle_text:
                        print(f"\t {subtitle_text}")

def laszlos_krog():
    print("\n=============")
    print("Laszlos Krog")
    print("=============")
    url = "https://www.laszloskrog.se/ebbepark/"
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching page: {e}")
        return
    soup = BeautifulSoup(response.content, "html.parser")
    week_tags = soup.find_all("p", class_="vecka")
    current_week_num = date.today().isocalendar().week
    for week_tag in week_tags:
        strong_tag = week_tag.find("strong")
        if not strong_tag:
            continue
        week_text = strong_tag.get_text(strip=True)
        match = re.match(r"VECKA\s+(\d+)", week_text, flags=re.IGNORECASE)
        if match:
            week_num = int(match.group(1))
            if week_num == current_week_num:
                get_laszlos_week_menu(week_tag)
                return  # Stop after finding the correct week
    print("Current week's menu not found.")

def don_luigi():
    print("\n=============")
    print("Don Luigi")
    print("=============")
    url = "https://www.donluigi.se/lunchmeny/"
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching page: {e}")
        return
    soup = BeautifulSoup(response.content, "html.parser")
    # Find the entry-content div
    entry_content = soup.find("div", class_="entry-content")
    if not entry_content:
        print("Could not find entry-content div.")
        return
    # Find the menu image within the entry-content div
    img_tag = entry_content.find("img")
    if not img_tag:
        print("Could not find image in entry-content div.")
        return
    img_src = img_tag.get("src")
    if not img_src:
        print("Could not find image source.")
        return
    # Make sure we have the full URL
    if img_src.startswith("//"):
        img_src = "https:" + img_src
    elif img_src.startswith("/"):
        img_src = "https://www.donluigi.se" + img_src
    try:
        # Download the image
        img_response = requests.get(img_src)
        img_response.raise_for_status()
        # Initialize the EasyOCR reader with Swedish and English.
        # Suppress the GPU warning by temporarily redirecting stdout.
        old_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            reader = easyocr.Reader(['sv', 'en'], verbose=False)
        finally:
            sys.stdout = old_stdout
        # Extract text from the image (readtext accepts the raw bytes
        # directly, so no PIL decode is needed)
        results = reader.readtext(img_response.content)
        # Combine all detected text with reasonable confidence
        text_parts = []
        for (bbox, text, confidence) in results:
            if confidence > 0.5:
                text_parts.append(text)
        text = ' '.join(text_parts)
        if not text.strip():
            print("Could not extract text from the menu image.")
            return
        # Clean up the extracted text: collapse runs of whitespace
        text = re.sub(r'\s+', ' ', text.strip())
        # Remove everything from "*Det går bra att beställa" onwards
        after_work_pos = text.find('*Det går bra att beställa')
        if after_work_pos != -1:
            text = text[:after_work_pos].strip()
        # Fix common OCR artifacts
        text = text.replace('_', ', ')  # OCR sometimes reads commas as underscores
        # text = text.replace('írån', 'från')             # Fix Swedish characters
        # text = text.replace('Falaíel', 'Falafel')       # Fix OCR error
        # text = text.replace('49krlextra', '49kr extra') # Fix price formatting
        # Try to identify and format the different sections
        formatted_text = text
        # Remove prices (3 or more digits)
        formatted_text = re.sub(r'\b\d{3,}\b', '', formatted_text)
        # Add line breaks before key sections
        formatted_text = re.sub(r'(LUNCHMENY|Veckans|Pasta|Sallad|Husman)', r'\n\1', formatted_text)
        formatted_text = re.sub(r'(MAN - FRE|MÅN FRE)', r'\n\1', formatted_text)
        # Add line breaks after specific headers and indent the following content
        formatted_text = re.sub(r'(Veckans Pinsa)\s+([A-ZÅÄÖ])', r'\1\n\t\2', formatted_text)
        formatted_text = re.sub(r'(Pasta)\s+(Pasta|[A-ZÅÄÖ])', r'\1\n\t\2', formatted_text)
        formatted_text = re.sub(r'(Sallad)\s+([A-ZÅÄÖ])', r'\1\n\t\2', formatted_text)
        formatted_text = re.sub(r'(Husman)\s+([A-ZÅÄÖ])', r'\1\n\t\2', formatted_text)
        # Collapse multiple newlines
        formatted_text = re.sub(r'\n\s*\n', '\n', formatted_text)
        # Re-indent line by line: section headers flush left with a blank line
        # before them, everything else indented with a tab
        lines = formatted_text.split('\n')
        final_lines = []
        for line in lines:
            line = line.strip()
            if not line:
                continue
            if any(keyword in line.upper() for keyword in ['LUNCHMENY', 'MAN - FRE', 'MÅN FRE']):
                # Main section header
                if final_lines:  # Add spacing before headers (except the first)
                    final_lines.append("")
                final_lines.append(line)
            elif line in ['Veckans Pinsa', 'Pasta', 'Sallad', 'Husman']:
                # Menu item header
                if final_lines:  # Add spacing before headers (except the first)
                    final_lines.append("")
                final_lines.append(line)
            else:
                # Regular text: indent it (the line was stripped above, so it
                # cannot already start with a tab)
                final_lines.append(f"\t{line}")
        final_text = '\n'.join(final_lines)
        if final_text.strip():
            print(final_text)
        else:
            print("No readable menu content found in the image.")
    except requests.RequestException as e:
        print(f"Error downloading image: {e}")
    except Exception as e:
        print(f"Error processing image: {e}")

if __name__ == "__main__":
    laszlos_krog()
    laluna()
    don_luigi()