Skip to content

Instantly share code, notes, and snippets.

@yazdipour
Created October 10, 2024 11:25
Show Gist options
  • Save yazdipour/e7b4a36ae266d379295aaedfce99a3dc to your computer and use it in GitHub Desktop.
Save yazdipour/e7b4a36ae266d379295aaedfce99a3dc to your computer and use it in GitHub Desktop.
# A Python scraper that fetches https://www.einbuergerungstest-online.eu/fragen/{page_number}/ (page_number runs from 1 to 10) and scrapes the questions and answers from each page.
# The scraper should return the questions (class=questions-question-text), the multiple-choice options (li elements inside class="list-unstyled question-answers-list"; the element with class=question-answer-right is the correct answer) and the progress percentage (class=progress-bar-success) in JSON format.
# The scraper should be able to scrape all 10 pages and save the result in a JSON file.
# The output should be in the following format:
# {
# "questions": [
# {
# "question_id": "1",
# "question": "In Deutschland dürfen Menschen offen etwas gegen die Regierung sagen, weil …",
# "options": [
# "hier Religionsfreiheit gilt.",
# "die Menschen Steuern zahlen.",
# "die Menschen das Wahlrecht haben.",
# "hier Meinungsfreiheit gilt."
# ],
# "answer": "✅ hier Meinungsfreiheit gilt.",
# "percentage": 91,
# "language": "de"
# }
# ]
# }
# <div class="content container">
# <h2>Die Fragen mit Antworten</h2>
# <div class="questions-question-stats">
# <div class="progress" title="In 90,6% der bisher abgeschlossenen Tests wurde diese Frage richtig beantwortet">
# <div class="progress-bar progress-bar-success" style="width:91%">91%</div>
# <div class="progress-bar progress-bar-danger" style="width:9%"></div>
# </div>
# </div>
# <div class="row" id="frage-1">
# <div class="col-sm-6">
# <div class="questions-question-text">
# <div class="questions-question-id">1.</div>
# <p><a href="/fragen/1-in-deutschland-duerfen-menschen-offen-etwas-gegen-die-regierung-sagen-weil/">In Deutschland dürfen Menschen offen etwas gegen die Regierung sagen, weil …</a></p>
# </div>
# </div>
# <div class="col-xs-11 col-xs-offset-1 col-sm-offset-0 col-sm-6">
# <ul class="list-unstyled question-answers-list">
# <li>hier Religionsfreiheit gilt.</li>
# <li>die Menschen Steuern zahlen.</li>
# <li>die Menschen das Wahlrecht haben.</li>
# <li><span class="question-answer-right">hier Meinungsfreiheit gilt.</span></li>
# </ul>
# </div>
# </div>
# <div class="questions-question-stats">
# <div class="progress" title="In 92,3% der bisher abgeschlossenen Tests wurde diese Frage richtig beantwortet">
# <div class="progress-bar progress-bar-success" style="width:92%">92%</div>
# <div class="progress-bar progress-bar-danger" style="width:8%"></div>
# </div>
# </div>
# <div class="row" id="frage-2">
# <div class="col-sm-6">
# <div class="questions-question-text">
# <div class="questions-question-id">2.</div>
# <p>In Deutschland können Eltern bis zum 14. Lebensjahr ihres Kindes entscheiden, ob es in der Schule am …</p>
# </div>
# </div>
# <div class="col-xs-11 col-xs-offset-1 col-sm-offset-0 col-sm-6">
# <ul class="list-unstyled question-answers-list">
# <li>Geschichtsunterricht teilnimmt.</li>
# <li><span class="question-answer-right">Religionsunterricht teilnimmt.</span></li>
# <li>Politikunterricht teilnimmt.</li>
# <li>Sprachunterricht teilnimmt.</li>
# </ul>
# </div>
# </div>
# <div class="questions-question-stats">
# <div class="progress" title="In 94,2% der bisher abgeschlossenen Tests wurde diese Frage richtig beantwortet">
# <div class="progress-bar progress-bar-success" style="width:94%">94%</div>
# <div class="progress-bar progress-bar-danger" style="width:6%"></div>
# </div>
# </div>
# <div class="search-box search-box--bottom">
# <form action="/suche/" method="get">
# <p>Einbürgerungstest-Fragen durchsuchen:</p>
# <div class="search-box__input"><input type="hidden" name="cx" value="partner-pub-7803896590914046:7122373411"><input type="hidden" name="cof" value="FORID:10"><input class="form-control" type="text" name="q" value=""><button class="btn btn-default">🔍</button></div>
# </form>
# </div>
# </div>
import requests
from bs4 import BeautifulSoup
import json
def scrape_questions():
    """Scrape the question pages and save them to ``de.json``.

    Requests pages 1 through 10 below
    ``https://www.einbuergerungstest-online.eu/fragen``. On each page the
    question rows are paired, by position, with the
    ``questions-question-stats`` blocks. One dict per question is collected
    (id, text, options, correct answer prefixed with a check mark, success
    percentage, language) and the whole list is written as
    ``{"questions": [...]}`` once all pages have been processed.

    Raises:
        requests.RequestException: if a page cannot be fetched, the request
            times out, or the server answers with an HTTP error status.
    """
    language = 'de'
    base_url = "https://www.einbuergerungstest-online.eu/fragen"
    all_questions = []
    for page_number in range(1, 11):
        url = f"{base_url}/{page_number}/"
        # Without a timeout a stalled connection would hang the run forever.
        response = requests.get(url, timeout=30)
        # Fail loudly instead of parsing an error page as "no questions".
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Keep only rows that actually hold a question; any other
        # <div class="row"> would otherwise raise AttributeError below.
        questions = [
            row for row in soup.find_all('div', class_='row')
            if row.find('div', class_='questions-question-text') is not None
        ]
        # get all stats
        stats = soup.find_all('div', class_='questions-question-stats')
        for question, stat in zip(questions, stats):
            text_block = question.find('div', class_='questions-question-text')
            # The id is rendered as e.g. "1." - drop the trailing dot.
            question_id = text_block.div.text.strip().rstrip('.')
            question_text = text_block.p.text.strip()
            answers_list = question.find('ul', class_='question-answers-list')
            options = [li.text.strip() for li in answers_list.find_all('li')]
            answer = question.find('span', class_='question-answer-right').text.strip()
            # The bar text looks like "91%" - strip the percent sign.
            success_bar = stat.find('div', class_='progress-bar-success')
            percentage = int(success_bar.text.strip().rstrip('%'))
            all_questions.append({
                "question_id": question_id,
                "question": question_text,
                "options": options,
                "answer": f"✅ {answer}",
                "percentage": percentage,
                "language": language,
            })
    with open(language + '.json', 'w', encoding='utf-8') as f:
        json.dump({"questions": all_questions}, f, ensure_ascii=False, indent=4)
# scrape_questions()
from notion_client import Client
# Notion integration token (the value here looks like a placeholder).
# NOTE(review): no code visible in this file reads this constant - confirm
# whether it is still needed, and never commit a real token.
NOTION_TOKEN = "secret_v"
# Then write a Python script that reads the JSON file and uses the Notion API to create a new item in your Notion database for each question, storing the question and its answers in that item.
# Each item has the following properties:
# - Question Number
# - Question
# - Option 1
# - Option 2
# - Option 3
# - Option 4
# - Answer
# - Percentage
# - Language
def add_questions_to_notion(token, database_id, language='persisch'):
    """Create one Notion database page per question read from ``<language>.json``.

    Args:
        token: Notion integration token, passed to ``Client(auth=...)``.
        database_id: ID of the database used as parent of every created page.
        language: Base name (without ``.json``) of the file to read. Defaults
            to ``'persisch'``, the value that used to be hard-coded here.

    The file must contain ``{"questions": [...]}`` where every entry has the
    keys ``question_id``, ``question``, ``options``, ``answer``,
    ``percentage`` and ``language``. At most the first four options are
    written ("Option 1" .. "Option 4"); if a question has fewer, the
    remaining option properties are simply left unset.

    Raises:
        FileNotFoundError: if ``<language>.json`` does not exist.
        KeyError: if a question entry lacks one of the required keys.
    """
    def rich_text(content):
        # Wrap plain text in the structure used for rich_text properties.
        return {"rich_text": [{"text": {"content": content}}]}

    client = Client(auth=token)
    with open(language + '.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    for question in data['questions']:
        properties = {
            "Nr": {"number": int(question['question_id'])},
            "Question": {"title": [{"text": {"content": question['question']}}]},
        }
        # One "Option N" property per answer option, capped at four.
        for position, option in enumerate(question['options'][:4], start=1):
            properties[f"Option {position}"] = rich_text(option)
        properties["Answer"] = rich_text(question['answer'])
        properties["Percentage"] = {"number": question['percentage']}
        properties["Language"] = rich_text(question['language'])

        client.pages.create(
            parent={"database_id": database_id},
            properties=properties,
        )
        print(question['question_id'], 'added to notion')
if __name__ == "__main__":
    # Upload only when run as a script, so that importing this module does
    # not trigger Notion API calls as a side effect.
    # NOTE(review): 'TOKEN' and 'DATABASE_ID' look like placeholders -
    # replace them with real values before running.
    add_questions_to_notion('TOKEN', 'DATABASE_ID')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment