Last active
March 13, 2025 13:45
-
-
Save duoduoyeah/6c74d7a628a8285c275e515e87cf2b2a to your computer and use it in GitHub Desktop.
This script converts a double-column PDF into a single-column PDF. Some conference papers use a double-column layout, which is hard for OCR to detect.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Converts a double-column PDF to a single-column PDF by extracting
# each column separately and then interleaving the pages.
#
# Usage:
#   ./2one.sh <input.pdf> <output.pdf>
#
# Arguments:
#   input.pdf:  The path to the input PDF file.
#   output.pdf: The path to the output PDF file.
#
# Requires: pdfinfo, pdfjam and pdftk on PATH.
# Exit status: 0 on success, non-zero if arguments are wrong, a tool is
# missing, or any processing step fails.

set -euo pipefail

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Check if correct arguments provided
if [[ "$#" -ne 2 ]]; then
  echo "Usage: $0 <input_pdf> <output_pdf>" >&2
  exit 1
fi

# Fail early with a clear message instead of a cryptic "command not found".
for tool in pdfinfo pdfjam pdftk; do
  command -v "$tool" >/dev/null || die "Error: required tool '$tool' not found in PATH"
done

INPUT_PDF="$1"
OUTPUT_PDF="$2"
CURRENT_DIR=$(pwd)

# If input doesn't contain path, assume current directory
if [[ "$INPUT_PDF" != */* ]]; then
  INPUT_PDF="$CURRENT_DIR/$INPUT_PDF"
fi

# If output doesn't contain path, assume current directory
if [[ "$OUTPUT_PDF" != */* ]]; then
  OUTPUT_PDF="$CURRENT_DIR/$OUTPUT_PDF"
fi

[[ -f "$INPUT_PDF" ]] || die "Error: input file not found: $INPUT_PDF"

# Keep intermediates in a private temp directory so that existing files in
# the working directory are never overwritten, and remove it on every exit
# path (including failures).
TMP_DIR=$(mktemp -d) || die "Error: could not create temporary directory"
trap 'rm -rf -- "$TMP_DIR"' EXIT
LEFT_COLUMN="$TMP_DIR/left_column_temp.pdf"
RIGHT_COLUMN="$TMP_DIR/right_column_temp.pdf"

# Get page size in points.
# pdfinfo prints e.g. "Page size:      612 x 792 pts (letter)", so field 3 is
# the width and field 5 the height. awk reads all input (no early exit) so
# pdfinfo cannot be killed by SIGPIPE under pipefail.
echo "Getting page dimensions..."
page_size=$(pdfinfo "$INPUT_PDF" \
  | awk '/^Page size:/ && !seen { print $3, $5; seen = 1 }')
read -r page_width_pts page_height_pts <<<"$page_size"

num_re='^[0-9]+([.][0-9]+)?$'
if ! [[ "${page_width_pts:-}" =~ $num_re && "${page_height_pts:-}" =~ $num_re ]]; then
  die "Error: could not determine page size of $INPUT_PDF"
fi

# Keep the fractional part so both columns come out exactly the same width;
# LC_ALL=C guarantees a '.' decimal separator, which pdfjam/TeX requires.
half_width_pts=$(LC_ALL=C awk -v w="$page_width_pts" 'BEGIN { printf "%.3f", w / 2 }')
echo "Page width in pts: $page_width_pts"
echo "Half width in pts: $half_width_pts"

# Output paper is half the original width and the full original height, so
# each column is reproduced at its original scale.
paper_size="{${half_width_pts}pt,${page_height_pts}pt}"

# Extract left and right columns.
# --trim takes "left bottom right top": trimming the right half keeps the
# left column, and trimming the left half keeps the right column.
echo "Extracting left column..."
pdfjam --papersize "$paper_size" --trim "0pt 0pt ${half_width_pts}pt 0pt" \
  --clip true "$INPUT_PDF" --outfile "$LEFT_COLUMN"

echo "Extracting right column..."
pdfjam --papersize "$paper_size" --trim "${half_width_pts}pt 0pt 0pt 0pt" \
  --clip true "$INPUT_PDF" --outfile "$RIGHT_COLUMN"

# Interleave the columns: shuffle takes one page at a time from each input,
# giving left1, right1, left2, right2, ...
pdftk A="$LEFT_COLUMN" B="$RIGHT_COLUMN" shuffle A B output "$OUTPUT_PDF"

# Temporary files are removed by the EXIT trap.
echo "Created $OUTPUT_PDF with alternating pages from left and right columns of $INPUT_PDF"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment