PDF downloading utils: paired Python and Bash scripts to extract PDF links from a webpage and download them in bulk.
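The first script is the Python downloader. It reads a text file of URLs (one per line) and fetches them in parallel: joblib fans the downloads out across worker threads, tqdm reports progress, each response is streamed to disk in 8 KB chunks, and tenacity retries a failed download up to five times with a two-second pause between attempts.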
import os
import argparse

import requests
from urllib.parse import urlparse
from tqdm import tqdm
from joblib import Parallel, delayed
from tenacity import retry, stop_after_attempt, wait_fixed


@retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
def download_url(url, target_dir, force_redownload):
    filename = os.path.basename(urlparse(url).path)
    target_path = os.path.join(target_dir, filename)
    if os.path.exists(target_path) and not force_redownload:
        return  # File already exists and redownload not forced
    response = requests.get(url, stream=True)
    response.raise_for_status()  # Raise an error for bad responses
    with open(target_path, "wb") as target_file:
        for chunk in response.iter_content(chunk_size=8192):
            target_file.write(chunk)


def download_files_from_list(file_path, target_dir, n_jobs, force_redownload):
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # Load all URLs into a list, skipping blank lines
    with open(file_path, "r") as file:
        urls = [line.strip() for line in file if line.strip()]
    # Parallel download using joblib and tqdm for progress
    Parallel(n_jobs=n_jobs, prefer="threads")(
        delayed(download_url)(url, target_dir, force_redownload)
        for url in tqdm(urls, desc="Downloading files")
    )


def main():
    parser = argparse.ArgumentParser(
        description="Download files from a list of URLs in parallel."
    )
    parser.add_argument(
        "file_path", help="Path to the text file containing URLs, one per line."
    )
    parser.add_argument(
        "target_dir", help="Directory where the files should be downloaded."
    )
    parser.add_argument(
        "--n_jobs",
        type=int,
        default=-1,
        help="Number of parallel download workers. Default (-1) uses one per CPU core.",
    )
    parser.add_argument(
        "--force_redownload",
        action="store_true",
        help="Force re-download of files even if they already exist.",
    )
    args = parser.parse_args()
    download_files_from_list(
        args.file_path, args.target_dir, args.n_jobs, args.force_redownload
    )
    print(f"Files downloaded to: {args.target_dir}")


if __name__ == "__main__":
    main()
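A sketch of a typical invocation, assuming the script is saved as download_pdfs.py (a hypothetical name; the gist's filenames aren't shown here) and the third-party packages are installed:

pip install requests tqdm joblib tenacity
python download_pdfs.py pdf_links.txt ./pdfs --n_jobs 8

Pass --force_redownload to overwrite files that already exist in the target directory.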
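The same task as a minimal Bash script: it hands each listed URL to wget sequentially, without the parallelism or skip-if-present logic of the Python version.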
#!/bin/bash
FILE_PATH="$1"
TARGET_DIR="$2"

if [ -z "$FILE_PATH" ] || [ -z "$TARGET_DIR" ]; then
    echo "Usage: $0 <file_path> <target_dir>"
    exit 1
fi

# Create the target directory if it doesn't exist
mkdir -p "$TARGET_DIR"

while IFS= read -r url; do
    wget -P "$TARGET_DIR" "$url"
done < "$FILE_PATH"

echo "Files downloaded to: $TARGET_DIR"
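Next, the Python link extractor: it fetches a page, parses it with BeautifulSoup, keeps every <a> tag whose href ends in .pdf, resolves relative links to absolute URLs against the page address, and writes the result to <host>_pdf_links.txt (dots in the hostname replaced with underscores).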
import argparse

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse


def extract_pdf_links_from_url(url):
    response = requests.get(url)
    response.raise_for_status()  # Fail fast on HTTP errors instead of parsing an error page
    soup = BeautifulSoup(response.content, "html.parser")
    pdf_links = []
    for link in soup.find_all("a"):
        href = link.get("href")
        if href and href.endswith(".pdf"):
            # Convert relative URLs to absolute URLs
            abs_url = requests.compat.urljoin(url, href)
            pdf_links.append(abs_url)
    return pdf_links


def main():
    parser = argparse.ArgumentParser(
        description="Extract PDF links from a given webpage."
    )
    parser.add_argument("url", help="The URL of the webpage to extract from.")
    args = parser.parse_args()
    pdf_links = extract_pdf_links_from_url(args.url)
    # Use the base URL to create the file name
    base_url = urlparse(args.url).netloc
    file_name = base_url.replace(".", "_") + "_pdf_links.txt"
    with open(file_name, "w") as f:
        for link in pdf_links:
            f.write(link + "\n")
    print(f"PDF download links written to: {file_name}")


if __name__ == "__main__":
    main()
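For example, again with a hypothetical filename:

pip install requests beautifulsoup4
python extract_pdf_links.py https://example.com/papers
# writes example_com_pdf_links.txt, which can be fed straight to the downloader above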
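Finally, the same extraction in Bash. It scrapes hrefs with grep and sed rather than an HTML parser, so it is more fragile than the BeautifulSoup version: only quoted href attributes ending exactly in .pdf are matched.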
#!/bin/bash
URL="$1"

if [ -z "$URL" ]; then
    echo "Usage: $0 <URL>"
    exit 1
fi

# Extract the scheme and host for use in forming absolute URLs
SCHEME=$(echo "$URL" | awk -F:// '{print $1}')
BASE_URL=$(echo "$URL" | awk -F/ '{print $3}')
FILENAME="${BASE_URL//./_}_pdf_links.txt"

wget -O - "$URL" 2>/dev/null \
    | grep -o '<a href=['"'"'"][^"'"'"']*\.pdf['"'"'"]' \
    | sed 's/^<a href=["'"'"']\([^"'"'"']*\)["'"'"']$/\1/g' \
    | while read -r line; do
        # Check if the line starts with "http", indicating it's an absolute URL
        if [[ $line == http* ]]; then
            echo "$line"
        else
            # Form the full absolute URL, keeping the original scheme
            # (assumes a root-relative href such as /docs/report.pdf)
            echo "$SCHEME://$BASE_URL$line"
        fi
    done > "$FILENAME"

echo "PDF download links written to: $FILENAME"