Created
May 29, 2015 17:36
-
-
Save christopherlovell/c2e210b1d8d758a4f0e7 to your computer and use it in GitHub Desktop.
powershell pdf/word parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Configuration: edit these paths for the local machine before running.
## Both folder paths keep their trailing backslash because code further down
## builds file paths from them by plain string concatenation.

# Location of the pdf2txt.py converter script from pdfminer.
$pyscript = 'C:\Users\324240\Desktop\pdfminer-master\tools\pdf2txt.py'

# Folder that is scanned for the source documents.
$input_folder = 'C:\Users\324240\Desktop\Markets Surveillance Meetings\'

# Folder that receives the extracted .txt files (a "test" subfolder of the input folder).
$output_folder = $input_folder + 'test\'
## Python pdf converter (currently disabled)
## Uses the pdf2txt.py tool from the pdfminer library: https://github.com/euske/pdfminer/tree/master/tools
## NOTE: $folder is never assigned in this script; replace it with $input_folder / $output_folder before re-enabling.
#Get-ChildItem $folder -Filter *.pdf | `
#Foreach-Object{
# print ($input_folder+$_.BaseName)
# python $pyscript $_.FullName | Out-File ($folder+"output\"+$_.BaseName+".txt")
#}
## grep method for word documents
## from here https://woozle.org/neale/papers/docx.html
##  - requires unzip, grep and sed for Windows to be available on the PATH
##
## For every .docx file in $input_folder, the archive contents are streamed to
## stdout, the lines containing Word text runs (<w:t ...>) are kept, all XML
## tags are stripped, blank lines are dropped, and the remaining text is
## written to "<BaseName>.txt" in $output_folder.

# Out-File does not create missing directories; -Force makes this a no-op
# when the folder already exists.
New-Item -ItemType Directory -Path $output_folder -Force | Out-Null

Get-ChildItem -Path $input_folder -Filter *.docx |
ForEach-Object {
    # Join-Path works whether or not $output_folder ends with a backslash.
    $txt_path = Join-Path $output_folder ($_.BaseName + ".txt")

    # -LiteralPath stops characters such as [ and ] in the document name
    # from being interpreted as wildcards when the output file is created.
    unzip -qc $_.FullName |
        grep '<w:t' |
        sed 's/<[^<]*>//g' |
        grep -v '^[[:space:]]*$' |
        Out-File -LiteralPath $txt_path
}
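## powershell method for word documents (currently disabled)
## TODO: fix before re-enabling:
##  - $folder is never assigned in this script (use $input_folder)
##  - $_.FullName is already a full path, so it should not be prefixed with the folder
##  - print is not a PowerShell output command (use Write-Output)
##  - the document and the Word instance are never closed ($objDocument.Close(), $objWord.Quit())
#Get-ChildItem $folder -Filter *.docx | `
#Foreach-Object{
# $objWord = New-Object -Com Word.Application
# $objWord.Visible = $false
# $objDocument = $objWord.Documents.Open(($folder+$_.FullName))
# $paras = $objDocument.Paragraphs
# foreach ($para in $paras)
# {
# print $para.Range.Text
# }
#}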
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment