@georgepar
Created October 24, 2024 12:34
import argparse
import sys

from stream2sentence import generate_sentences


def file_or_pipe_input(file_path=None):
    if file_path:
        with open(file_path, "r", encoding="utf-8") as file:
            yield file.read()
    else:
        # If no file is given, read from pipe (stdin)
        yield sys.stdin.read()


def main():
    parser = argparse.ArgumentParser(
        description="Split text into sentences using generate_sentences"
    )
    parser.add_argument(
        "file",
        nargs="?",
        help="File to process (optional, can be piped input instead)",
    )
    parser.add_argument(
        "--tokenizer",
        type=str,
        default="nltk",
        help="Tokenizer to use (nltk, stanza, etc.)",
    )
    parser.add_argument(
        "--language", type=str, default="en", help="Language of the text"
    )
    parser.add_argument(
        "--context_size",
        type=int,
        default=12,
        help="Context size for sentence generation",
    )
    parser.add_argument(
        "--context_size_look_overhead",
        type=int,
        default=12,
        help="Context look overhead",
    )
    parser.add_argument(
        "--min_sentence_length", type=int, default=10, help="Minimum sentence length"
    )
    parser.add_argument(
        "--min_first_fragment_length",
        type=int,
        default=10,
        help="Minimum first fragment length",
    )
    parser.add_argument(
        "--quick_yield_single",
        action="store_true",
        help="Quick yield single sentence fragment",
    )
    parser.add_argument(
        "--quick_yield_all", action="store_true", help="Quick yield all sentences"
    )
    parser.add_argument(
        "--quick_yield_every", action="store_true", help="Quick yield every fragment"
    )
    parser.add_argument(
        "--cleanup_links", action="store_true", help="Clean up text links"
    )
    parser.add_argument(
        "--cleanup_emojis", action="store_true", help="Clean up text emojis"
    )
    args = parser.parse_args()

    # Stream file or pipe input
    stream = file_or_pipe_input(args.file)

    # Call generate_sentences with user-defined or default parameters
    for sentence in generate_sentences(
        stream,
        tokenizer=args.tokenizer,
        language=args.language,
        context_size=args.context_size,
        context_size_look_overhead=args.context_size_look_overhead,
        minimum_sentence_length=args.min_sentence_length,
        minimum_first_fragment_length=args.min_first_fragment_length,
        quick_yield_single_sentence_fragment=args.quick_yield_single,
        quick_yield_for_all_sentences=args.quick_yield_all,
        quick_yield_every_fragment=args.quick_yield_every,
        cleanup_text_links=args.cleanup_links,
        cleanup_text_emojis=args.cleanup_emojis,
    ):
        print(sentence)


if __name__ == "__main__":
    main()
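
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the gist): calling stream2sentence directly
# instead of going through the CLI above. The sample text is made up, and the
# keyword arguments simply mirror the script's argparse defaults.
# ---------------------------------------------------------------------------
from stream2sentence import generate_sentences

sample_text = "This is the first sentence. Here comes a second one! And a third?"

# Wrap the text in a one-item generator, the same shape file_or_pipe_input yields.
for sentence in generate_sentences(
    iter([sample_text]),
    tokenizer="nltk",
    language="en",
    minimum_sentence_length=10,
    minimum_first_fragment_length=10,
):
    print(sentence)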

# Second gist file: download and print the text content of a Wikipedia page.
import argparse
import sys

import requests
from bs4 import BeautifulSoup


def fetch_wikipedia_content(url):
    # Get the page content from Wikipedia
    response = requests.get(url)
    response.raise_for_status()  # Ensure the request was successful

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract the main content of the Wikipedia page
    content = soup.find(id="mw-content-text")

    # Find all the paragraphs and extract the text
    paragraphs = content.find_all("p")
    page_text = "\n".join([p.get_text() for p in paragraphs])

    return page_text


def main():
    parser = argparse.ArgumentParser(
        description="Download and print Wikipedia page text content."
    )
    parser.add_argument("url", help="The URL of the Wikipedia page to download.")
    args = parser.parse_args()

    try:
        # Fetch and print Wikipedia content
        page_text = fetch_wikipedia_content(args.url)
        print(page_text)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
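
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the gist): the two scripts form a natural
# pipeline, fetching a Wikipedia article and splitting it into sentences in a
# single process. The URL and the wikipedia_sentences helper below are only
# examples; the individual calls mirror the two functions above.
# ---------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
from stream2sentence import generate_sentences


def wikipedia_sentences(url):
    # Fetch and extract the article text, as fetch_wikipedia_content does above.
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find(id="mw-content-text").find_all("p")
    page_text = "\n".join(p.get_text() for p in paragraphs)
    # Split the extracted text into sentences, as the splitter script does.
    yield from generate_sentences(iter([page_text]), tokenizer="nltk", language="en")


for sentence in wikipedia_sentences("https://en.wikipedia.org/wiki/Python_(programming_language)"):
    print(sentence)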