Created
March 8, 2015 07:20
-
-
Save datnt/6e1d128089bef283b7c5 to your computer and use it in GitHub Desktop.
My solution to extract email and link from pdf data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Public: Class that parses the word extraction output from the
# `pdftoolkit-extract` tool and returns web links and emails to be added to a
# publication at a later stage.
#
# The format in which the tool outputs is as follows:
#
#   <WORD> <X1> <Y1> <X2> <Y2>
#
# with one word per line. Runs of lines separated by a blank line are scanned
# independently, so a detected link never spans a blank line.
#
# An example of what the original PDF looks like can be found here:
#
#   http://i.imgur.com/LqFxvzE.png
class LinkExtractor
  # Internal: One parsed line of tool output. Coordinates are Floats.
  Token = Struct.new(:word, :x1, :y1, :x2, :y2)
  private_constant :Token

  # Internal: Both recognised patterns are exactly five tokens long:
  #   <local> @ <domain> . <tld>      and      www . <name> . <tld>
  SPAN = 5

  # Public: Initializes a link extractor with the raw output from the
  # extraction tool.
  #
  # words - The String output of the tool. nil is treated as empty output.
  def initialize(words)
    @words = words.to_s
  end

  # Public: Runs link detection and yields each result as links are picked up.
  #
  # Yields a Hash of the form
  #   { type: :email or :link, content: String,
  #     rect: { x1: Float, x2: Float, y1: Float, y2: Float } }
  # where x1/y1 come from the first token of the match and x2/y2 from the
  # last one.
  #
  # Returns the Array of detected links when a block is given, otherwise an
  # Enumerator over them.
  def each_link
    return enum_for(:each_link) unless block_given?

    detect_links.each { |link| yield link }
  end

  private

  # Private: The detection algorithm. Scans every term token by token and
  # collects the emails and links found, in input order.
  #
  # Returns an Array of link Hashes.
  def detect_links
    terms.flat_map do |term|
      tokens = tokenize(term)
      tokens.each_index.map { |at| email_at(tokens, at) || url_at(tokens, at) }.compact
    end
  end

  # Private: Splits the raw output into terms. A blank line (empty or
  # whitespace-only) ends a term.
  #
  # Returns an Array of Strings.
  def terms
    @words.split(/\n\s*\n/)
  end

  # Private: Parses every line of a term exactly once.
  #
  # Lines without any word are dropped. Missing coordinates become 0.0 and
  # extra trailing fields are ignored, so malformed lines never raise.
  #
  # Returns an Array of Tokens.
  def tokenize(term)
    parsed = term.split("\n").map do |line|
      word, x1, y1, x2, y2 = line.split(" ")
      Token.new(word, x1.to_f, y1.to_f, x2.to_f, y2.to_f) if word
    end
    parsed.compact
  end

  # Private: Detects `<local> @ <domain> . <tld>` with the "@" at index `at`.
  #
  # Returns a link Hash, or nil when the tokens around `at` are not an email.
  def email_at(tokens, at)
    return unless tokens[at].word == "@"
    # An "@" that opens the term has no local part; without this guard the
    # index -1 would silently wrap around to the last token.
    return if at.zero?

    span = tokens[at - 1, SPAN]
    return unless span.size == SPAN && span[3].word == "."

    build_link(:email, span)
  end

  # Private: Detects `www . <name> . <tld>` starting at index `at`.
  #
  # Returns a link Hash, or nil when the tokens at `at` are not a web link.
  def url_at(tokens, at)
    span = tokens[at, SPAN]
    return unless span.size == SPAN
    return unless span[0].word == "www" && span[1].word == "." && span[3].word == "."

    build_link(:link, span)
  end

  # Private: Builds the result Hash for a run of matched tokens.
  def build_link(type, span)
    first = span.first
    last = span.last
    {
      type: type,
      content: span.map(&:word).join,
      rect: { x1: first.x1, x2: last.x2, y1: first.y1, y2: last.y2 }
    }
  end
end
require 'minitest/autorun'

# Tests for LinkExtractor. Every fixture is raw extraction-tool output, one
# `<WORD> <X1> <Y1> <X2> <Y2>` record per line.
class LinkExtractorTest < Minitest::Test
  # Emails are assembled from the five tokens `<local> @ <domain> . <tld>`;
  # the rect runs from the local part's x1/y1 to the tld's x2/y2.
  def test_emails
    fixture = <<-STR
CALL 250.934100 182.967400 282.310500 173.787400
: 283.425300 182.967400 285.117300 173.787400
01327 289.278900 182.967400 323.618100 173.787400
305232 326.447700 182.967400 372.389700 173.787400
West 138.282900 165.399400 168.477300 153.951400
- 171.998100 165.399400 176.162100 153.951400
wdistric 179.273700 165.399400 226.673700 153.951400
@ 228.076500 165.399400 239.068500 153.951400
ford 240.339300 165.399400 264.020100 153.951400
. 265.349700 165.399400 267.041700 153.951400
com 268.059300 165.399400 293.304900 153.951400
Trust 320.753700 165.399400 351.342900 153.951400
Ford 355.294500 165.399400 381.124500 153.951400
/ 381.951300 165.399400 388.359300 153.951400
Retail 389.330100 165.399400 423.173700 153.951400
- 426.693300 165.399400 430.857300 153.951400
retail12 434.664900 165.399400 476.472900 153.951400
@ 478.115700 165.399400 489.107700 153.951400
ford 490.378500 165.399400 514.060500 153.951400
. 515.390100 165.399400 517.082100 153.951400
com 518.100900 165.399400 543.347700 153.951400
North 94.534500 147.399400 127.539300 135.951400
/ 128.326500 147.399400 134.734500 135.951400
SNIC 134.877300 147.399400 164.742900 135.951400
- 168.198900 147.399400 172.362900 135.951400
nsnic123 176.166900 147.399400 226.365300 135.951400
@ 228.016500 147.399400 239.008500 135.951400
ford 240.275700 147.399400 263.957700 135.951400
. 265.287300 147.399400 266.979300 135.951400
com 267.996900 147.399400 293.238900 135.951400
East 321.407700 147.399400 346.836900 135.951400
- 350.357700 147.399400 354.521700 135.951400
edistric 358.001700 147.399400 401.745300 135.951400
    STR

    expected = [
      {:type=>:email, :content=>"wdistric@ford.com", :rect=>{:x1=>179.2737, :x2=>293.3049, :y1=>165.3994, :y2=>153.9514}},
      {:type=>:email, :content=>"retail12@ford.com", :rect=>{:x1=>434.6649, :x2=>543.3477, :y1=>165.3994, :y2=>153.9514}},
      {:type=>:email, :content=>"nsnic123@ford.com", :rect=>{:x1=>176.1669, :x2=>293.2389, :y1=>147.3994, :y2=>135.9514}}
    ]
    assert_equal expected, links_for(fixture)
  end

  # Web links are assembled from the five tokens `www . <name> . <tld>`.
  def test_urls
    fixture = <<-STR
0181 339.203831 221.163031 356.562792 211.117225
- 358.920122 221.163031 361.306201 211.117225
458845 361.808957 221.163031 390.528143 211.117225
| 394.763276 221.163031 395.515513 211.117225
www 399.430156 221.163031 422.122463 211.117225
. 421.572410 221.163031 423.575183 211.117225
pols 424.251228 221.163031 443.201512 211.117225
. 443.651914 221.163031 445.654686 211.117225
nl 446.661196 221.163031 455.280747 211.117225
5 306.554777 188.411494 310.363879 178.371012
. 311.461091 188.411494 312.630174 178.371012
03 313.425103 188.411494 322.188432 178.371012
CASE 339.382200 188.411494 361.757467 178.371012
IH 365.514557 188.411494 373.635849 178.371012
& 377.382365 188.411494 383.184656 178.371012
Steyr 386.360906 188.411494 406.465606 178.371012
De 339.650514 177.497927 350.637016 168.676570
Kolk 354.436100 177.497927 372.006923 168.676570
2 374.862116 177.497927 379.097645 168.676570
, 379.806761 177.497927 381.062087 168.676570
8255 383.645969 177.497927 402.346499 168.676570
PE 406.366023 177.497927 415.307022 168.676570
SWIFTERBANT 418.718047 177.497927 482.538383 168.676570
0321 339.203831 166.605654 356.562792 156.549201
- 358.920122 166.605654 361.306201 156.549201
335515 362.326420 166.605654 390.528143 156.549201
| 394.763276 166.605654 395.515513 156.549201
www 399.430156 166.605654 422.122463 156.549201
. 421.572410 166.605654 423.575183 156.549201
caseih 424.653699 166.605654 453.066849 156.549201
. 453.751970 166.605654 455.754743 156.549201
com 456.833259 166.605654 475.524594 156.549201
    STR

    expected = [
      {:type=>:link, :content=>"www.pols.nl", :rect=>{:x1=>399.430156, :x2=>455.280747, :y1=>221.163031, :y2=>211.117225}},
      {:type=>:link, :content=>"www.caseih.com", :rect=>{:x1=>399.430156, :x2=>475.524594, :y1=>166.605654, :y2=>156.549201}}
    ]
    assert_equal expected, links_for(fixture)
  end

  # Dotted abbreviations ("b . w . ir .") and phone numbers must not be
  # reported as links.
  def test_false_positives
    fixture = <<-STR
kand 88.149000 210.623500 100.209000 205.103500
. 101.151000 210.623500 101.655000 205.103500
rechten 102.597000 210.623500 121.227000 205.103500
UFSIA 123.639000 210.623500 139.515000 205.103500
91 141.345000 210.623500 146.547000 205.103500
, 148.371000 210.623500 148.899000 205.103500
bijz 151.509000 210.623500 159.489000 205.103500
. 160.173000 210.623500 160.677000 205.103500
lic 161.625000 210.623500 166.425000 205.103500
. 167.175000 210.623500 167.679000 205.103500
int 168.627000 210.623500 174.183000 205.103500
. 174.951000 210.623500 175.455000 205.103500
pol 176.397000 210.623500 183.561000 205.103500
UIA 185.985000 210.623500 194.859000 205.103500
93 196.689000 210.623500 202.899000 205.103500
Plataandreef 88.221000 203.621500 119.775000 198.425500
9 121.509000 203.621500 124.401000 198.425500
, 125.169000 203.621500 125.697000 198.425500
2900 128.103000 203.621500 140.937000 198.425500
SCHOTEN 142.917000 203.621500 169.935000 198.425500
, 170.889000 203.621500 171.417000 198.425500
✆ 87.999000 196.581437 92.319000 191.423500
03 94.335000 196.581437 100.515000 191.423500
- 101.103000 196.581437 102.603000 191.423500
658 103.173000 196.581437 112.659000 191.423500
38 114.627000 196.581437 120.909000 191.423500
39 122.877000 196.581437 129.141000 191.423500
, 129.909000 196.581437 130.437000 191.423500
7 133.133836 196.581437 137.469773 191.423500
03 139.791000 196.581437 145.971000 191.423500
- 146.559000 196.581437 148.059000 191.423500
658 148.629000 196.581437 158.115000 191.423500
30 160.083000 196.581437 166.383000 191.423500
92 168.399000 196.581437 174.483000 191.423500
ADRIAENSEN 247.988800 465.641300 286.946800 460.259300
, 287.726800 465.641300 288.668800 460.259300
OSWALD 290.924800 465.641300 316.688200 460.259300
, 317.486200 465.641300 318.014200 460.259300
b 320.624200 465.641300 323.480200 460.259300
. 324.290200 465.641300 324.794200 460.259300
w 325.424200 465.641300 329.774200 460.259300
. 330.074200 465.641300 330.578200 460.259300
ir 331.526200 465.641300 334.160200 460.259300
. 334.190200 465.641300 334.694200 460.259300
UGent 337.334200 465.641300 353.426200 460.259300
53 355.496200 465.641300 361.694200 460.259300
    STR

    assert_empty links_for(fixture)
  end

  private

  # Collects everything LinkExtractor#each_link yields for the given fixture.
  #
  # Returns an Array of link Hashes.
  def links_for(fixture)
    [].tap do |links|
      LinkExtractor.new(fixture).each_link { |link| links << link }
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment