christopherlovell · May 29, 2015 17:36
diff --git a/gistfile1.txt b/gistfile1.txt
 ## Path to pdfminer's pdf2txt.py; in the visible script only the disabled PDF converter uses it.
 $pyscript = 'C:\Users\324240\Desktop\pdfminer-master\tools\pdf2txt.py'

 ## Folder scanned for source documents. Keep the trailing "\": paths are built by concatenation.
 $input_folder = 'C:\Users\324240\Desktop\Markets Surveillance Meetings\'

 ## Converted .txt files are written to the "test" sub-folder of the input folder.
 $output_folder = "${input_folder}test\"
 
 ## Python pdf converter
 ## Uses the python library here https://github.com/euske/pdfminer/tree/master/tools
 #Get-ChildItem $folder -Filter *.pdf | `
 #Foreach-Object{
 #    print ($input_folder+$_.BaseName)
 #    python $pyscript $_.FullName | Out-File ($folder+"output\"+$_.BaseName+".txt")
 #}

 ## grep method for word documents
 ## from here https://woozle.org/neale/papers/docx.html
 ## - requires unzip for windows (grep and sed must be on the PATH as well)
 ##
 ## For every .docx directly inside $input_folder (no recursion): dump the
 ## archive contents to stdout, keep the lines containing <w:t text runs,
 ## strip the XML tags, drop blank lines and write what is left to
 ## <BaseName>.txt inside $output_folder.

 ## Out-File does not create missing directories, so make sure the target exists.
 if (-not (Test-Path -LiteralPath $output_folder)) {
     New-Item -ItemType Directory -Path $output_folder | Out-Null
 }

 ## A trailing pipe already continues the statement; no backtick is needed.
 Get-ChildItem $input_folder -Filter *.docx |
 ForEach-Object {
     ## Join-Path works whether or not $output_folder ends with "\".
     $txt_path = Join-Path $output_folder ($_.BaseName + ".txt")

     ## -LiteralPath stops "[" and "]" in a document name being read as wildcards.
     unzip -qc $_.FullName | grep '<w:t' | sed 's/<[^<]*>//g' | grep -v '^[[:space:]]*$' | Out-File -LiteralPath $txt_path
 }

 ## powershell method for word documents
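 ## TODO: fix - problems visible in the disabled code below: $folder is not
 ## assigned anywhere in the visible script; $_.FullName is already a full path,
 ## so it should not be prefixed with a folder; the Word instance is never
 ## closed with Quit(); "print" is presumably meant to be Write-Output.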
 #Get-ChildItem $folder -Filter *.docx | `
 #Foreach-Object{
 #    $objWord = New-Object -Com Word.Application
 #    $objWord.Visible = $false
 #    $objDocument = $objWord.Documents.Open(($folder+$_.FullName))
 #    $paras = $objDocument.Paragraphs
 #    foreach ($para in $paras)
 #    {
 #        print $para.Range.Text
 #    }
 #}
	## Path to pdfminer's pdf2txt.py; in the visible script only the disabled PDF converter uses it.
	$pyscript = 'C:\Users\324240\Desktop\pdfminer-master\tools\pdf2txt.py'

	## Folder scanned for source documents. Keep the trailing "\": paths are built by concatenation.
	$input_folder = 'C:\Users\324240\Desktop\Markets Surveillance Meetings\'

	## Converted .txt files are written to the "test" sub-folder of the input folder.
	$output_folder = "${input_folder}test\"

	## Python pdf converter
	## Uses the python library here https://github.com/euske/pdfminer/tree/master/tools
	#Get-ChildItem $folder -Filter *.pdf | `
	#Foreach-Object{
	#    print ($input_folder+$_.BaseName)
	#    python $pyscript $_.FullName | Out-File ($folder+"output\"+$_.BaseName+".txt")
	#}

	## grep method for word documents
	## from here https://woozle.org/neale/papers/docx.html
	## - requires unzip for windows (grep and sed must be on the PATH as well)
	##
	## For every .docx directly inside $input_folder (no recursion): dump the
	## archive contents to stdout, keep the lines containing <w:t text runs,
	## strip the XML tags, drop blank lines and write what is left to
	## <BaseName>.txt inside $output_folder.

	## Out-File does not create missing directories, so make sure the target exists.
	if (-not (Test-Path -LiteralPath $output_folder)) {
	    New-Item -ItemType Directory -Path $output_folder | Out-Null
	}

	## Pipes must be plain "|" (an escaped "\|" passes a stray "\" argument);
	## a trailing pipe already continues the statement, so no backtick is needed.
	Get-ChildItem $input_folder -Filter *.docx |
	ForEach-Object {
	    ## Join-Path works whether or not $output_folder ends with "\".
	    $txt_path = Join-Path $output_folder ($_.BaseName + ".txt")

	    ## The "*" quantifiers matter: "<[^<]*>" removes whole tags and
	    ## "^[[:space:]]*$" matches empty or whitespace-only lines.
	    ## -LiteralPath stops "[" and "]" in a document name being read as wildcards.
	    unzip -qc $_.FullName | grep '<w:t' | sed 's/<[^<]*>//g' | grep -v '^[[:space:]]*$' | Out-File -LiteralPath $txt_path
	}

	## powershell method for word documents
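	## TODO: fix - problems visible in the disabled code below: $folder is not
	## assigned anywhere in the visible script; $_.FullName is already a full path,
	## so it should not be prefixed with a folder; the Word instance is never
	## closed with Quit(); "print" is presumably meant to be Write-Output.
	#Get-ChildItem $folder -Filter *.docx | `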
	#Foreach-Object{
	# $objWord = New-Object -Com Word.Application
	# $objWord.Visible = $false
	# $objDocument = $objWord.Documents.Open(($folder+$_.FullName))
	# $paras = $objDocument.Paragraphs
	# foreach ($para in $paras)
	# {
	# print $para.Range.Text
	# }
	#}
No results found