Created
October 22, 2014 02:52
-
-
Save jlisic/5c1342e525eb3977d564 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use strict;
use warnings;

use Lingua::EN::Fathom;

# Companion script for lingua.sas: analyse one text file with
# Lingua::EN::Fathom and print its readability statistics.
#
# This work is derived from the example at CPAN:
# http://search.cpan.org/dist/Lingua-EN-Fathom/lib/Lingua/EN/Fathom.pm
#
# Usage:  perl lingua.pl <text-file>
#
# Output: one "name value" record per line on STDOUT, name and value
# separated by a single blank, so the caller can parse it easily.

# Statistics to report, in output order.  Each entry is both the label that
# is printed and the name of the Lingua::EN::Fathom accessor that is called.
my @metrics = qw(
    num_chars
    num_words
    percent_complex_words
    num_sentences
    num_text_lines
    num_blank_lines
    num_paragraphs
    syllables_per_word
    words_per_sentence
    fog
    flesch
    kincaid
);

# Fail loudly instead of reporting statistics for a file that was never read.
my $path = $ARGV[0];
die "usage: $0 <text-file>\n" unless defined $path && length $path;
die "$0: cannot read '$path'\n" unless -r $path;

# create object
my $text = Lingua::EN::Fathom->new;

# read in text file
$text->analyse_file($path);

# print it all out in a way that can be parsed easily
for my $metric (@metrics) {
    my $value = $text->$metric;

    # An undefined statistic is printed as an empty value rather than
    # triggering an "uninitialized" warning on STDERR.
    $value = '' unless defined $value;

    print "$metric $value\n";
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**************************************************************************/
/* Name:   lingua.sas                                                     */
/* Author: Jonathan Lisic <[email protected]>                             */
/**************************************************************************/
/* Description:                                                           */
/*                                                                        */
/* lingua is a macro based on the example for the Lingua Module in CPAN   */
/* http://search.cpan.org/dist/Lingua-EN-Fathom/lib/Lingua/EN/Fathom.pm   */
/*                                                                        */
/* For every observation of the input data set the value of varName is    */
/* written to sample.txt in workDir, lingua.pl is run on that file, and   */
/* the statistics it prints are appended as one row to the output data    */
/* set.                                                                   */
/**************************************************************************/
/* Notes:                                                                 */
/*                                                                        */
/* This is not the fastest code around but it is simple and it works!     */
/* It is assumed that you have Strawberry Perl installed together with    */
/* the Lingua::EN::Fathom perl module; perl is called through the fixed   */
/* path c:\Strawberry\perl\bin\perl.                                      */
/* workDir must not contain blanks: it is pasted unquoted into the perl   */
/* command line.                                                          */
/**************************************************************************/
/* Macro parameters:                                                      */
/*   workDir - Temporary directory where lingua.pl also is                */
/*   varName - The variable to analyse (right now only supports one)      */
/*   input   - data set that contains varName                             */
/*   output  - output data set                                            */
/*
Example:
  %let workDir = C:\Users\JonathanLisic\src\lingua;
  %let varName = myVeryLongString;
  %let input   = work.input;
  %let output  = work.output;
  %lingua( &workDir, &varName, &input, &output );
*/
%macro lingua (workDir, varName, input, output);

  /* keep the loop counter and the row count out of the caller's scope */
  %local i rows;

  /* default so that the loop is simply skipped if the count cannot be read */
  %let rows = 0;

  /* create initial (empty) data set */
  data &output.;
    set _NULL_;
  run;

  /* get number of rows in data set */
  proc sql noprint;
    select count(*) into :rows from &input.;
  quit;

  /* strip the leading blanks that INTO leaves in the value */
  %let rows = &rows.;

  %do i = 1 %to &rows.;

    /* write out the variable for the current observation */
    filename lingOut "&workDir.\sample.txt";
    data _NULL_;
      set &input.;
      file lingOut;
      if (_N_ = &i.) then do;
        put &varName.;
        /* nothing after this observation is needed, so stop reading */
        stop;
      end;
    run;

    /* now we create the pipe that runs lingua.pl on the sample file */
    filename lingua pipe "c:\Strawberry\perl\bin\perl &workDir.\lingua.pl &workDir.\sample.txt";

    /* create data set: one row per "name value" line printed by lingua.pl */
    data linguaTmp (drop = line);
      infile lingua truncover;
      input line $char80.;
      variable = scan(line, 1, ' ');
      /* take the second blank-delimited token and read it at full width, */
      /* so that long values are not cut off                              */
      value = input(scan(line, 2, ' '), best32.);
    run;

    /* sorting to remove the occasional repeated line */
    proc sort data=linguaTmp nodup;
      by variable value;
    run;

    /* transpose the output: one column per statistic */
    proc transpose data=linguaTmp out=linguaTmp (drop=_name_);
      id variable;
      var value;
    run;

    /* append to output */
    data &output.;
      set &output. linguaTmp (keep= flesch fog kincaid num_blank_lines num_chars num_paragraphs num_sentences num_text_lines num_words percent_complex_words syllables_per_word words_per_sentence);
    run;

  %end;

%mend lingua;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment