Created
October 24, 2024 12:34
-
-
Save georgepar/b2d13a8504b1b719238eb95ee01000dd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import sys | |
from stream2sentence import generate_sentences | |
def file_or_pipe_input(file_path=None, chunk_size=65536):
    """Yield the input text, read either from a file or from stdin.

    The original implementation called ``.read()`` once, loading the whole
    input into memory and yielding it as a single string — which defeats the
    point of feeding a *stream* to the downstream sentence generator. This
    version yields bounded chunks so arbitrarily large inputs stay cheap.

    Args:
        file_path: Optional path to a UTF-8 text file. Falsy means read
            from the pipe (stdin) instead.
        chunk_size: Maximum number of characters per yielded chunk.

    Yields:
        str: successive chunks of the input text (possibly just one).
    """
    if file_path:
        with open(file_path, "r", encoding="utf-8") as file:
            # Stream in chunks so memory use is bounded by chunk_size.
            while chunk := file.read(chunk_size):
                yield chunk
    else:
        # If no file is given, read from pipe (stdin) the same way.
        while chunk := sys.stdin.read(chunk_size):
            yield chunk
def main():
    """Command-line entry point.

    Builds the argument parser, streams the input (positional file or
    piped stdin), and prints each sentence emitted by generate_sentences.
    """
    parser = argparse.ArgumentParser(
        description="Split text into sentences using generate_sentences"
    )
    parser.add_argument(
        "file", nargs="?", help="File to process (optional, can be piped input instead)"
    )

    # Valued options, registered data-driven: (flag, type, default, help).
    valued_options = [
        ("--tokenizer", str, "nltk", "Tokenizer to use (nltk, stanza, etc.)"),
        ("--language", str, "en", "Language of the text"),
        ("--context_size", int, 12, "Context size for sentence generation"),
        ("--context_size_look_overhead", int, 12, "Context look overhead"),
        ("--min_sentence_length", int, 10, "Minimum sentence length"),
        ("--min_first_fragment_length", int, 10, "Minimum first fragment length"),
    ]
    for flag, kind, default, text in valued_options:
        parser.add_argument(flag, type=kind, default=default, help=text)

    # Boolean switches: (flag, help).
    switches = [
        ("--quick_yield_single", "Quick yield single sentence fragment"),
        ("--quick_yield_all", "Quick yield all sentences"),
        ("--quick_yield_every", "Quick yield every fragment"),
        ("--cleanup_links", "Clean up text links"),
        ("--cleanup_emojis", "Clean up text emojis"),
    ]
    for flag, text in switches:
        parser.add_argument(flag, action="store_true", help=text)

    args = parser.parse_args()

    # Feed the (file or stdin) stream straight into the sentence generator,
    # mapping CLI names onto generate_sentences keyword arguments.
    sentences = generate_sentences(
        file_or_pipe_input(args.file),
        tokenizer=args.tokenizer,
        language=args.language,
        context_size=args.context_size,
        context_size_look_overhead=args.context_size_look_overhead,
        minimum_sentence_length=args.min_sentence_length,
        minimum_first_fragment_length=args.min_first_fragment_length,
        quick_yield_single_sentence_fragment=args.quick_yield_single,
        quick_yield_for_all_sentences=args.quick_yield_all,
        quick_yield_every_fragment=args.quick_yield_every,
        cleanup_text_links=args.cleanup_links,
        cleanup_text_emojis=args.cleanup_emojis,
    )
    for sentence in sentences:
        print(sentence)


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import requests | |
import sys | |
from bs4 import BeautifulSoup | |
def fetch_wikipedia_content(url, timeout=30):
    """Download a Wikipedia page and return its paragraph text.

    Args:
        url: Full URL of the Wikipedia article to fetch.
        timeout: Seconds to wait for the HTTP response. requests has no
            default timeout, so without this an unresponsive server would
            hang the program forever.

    Returns:
        str: the text of all ``<p>`` elements in the article body, joined
        with newlines.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        ValueError: if the page has no recognizable article container.
    """
    # Get the page content from Wikipedia; bound the wait with a timeout.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure the request was successful
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")
    # Wikipedia renders the article body inside id="mw-content-text".
    content = soup.find(id="mw-content-text")
    if content is None:
        # Fail with a clear message instead of an AttributeError on .find_all
        # when the URL is not a standard Wikipedia article page.
        raise ValueError(f"No article content found at {url}")
    # Find all the paragraphs and extract the text
    paragraphs = content.find_all("p")
    return "\n".join(p.get_text() for p in paragraphs)
def main():
    """Parse the command line, fetch the Wikipedia page, and print its text."""
    parser = argparse.ArgumentParser(
        description="Download and print Wikipedia page text content."
    )
    parser.add_argument("url", help="The URL of the Wikipedia page to download.")
    args = parser.parse_args()

    try:
        # Fetch and print Wikipedia content in one step.
        print(fetch_wikipedia_content(args.url))
    except Exception as exc:
        # Report the failure on stderr and exit non-zero for shell pipelines.
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment