xaprb · January 18, 2014 15:49
diff --git a/analyze_text.pl b/analyze_text.pl
 #!/usr/bin/perl

 use strict;
 use warnings FATAL => 'all';

 use English qw(-no_match_vars);
 use Lingua::EN::Fathom;
 use TeX::Hyphen;
 use List::Util qw(min);

 my $filename = shift or die "Give me a filename";

 my $hyp = new TeX::Hyphen();
   # file     => 'hyphen.tex',
   # style    => 'czech',
   # leftmin  => 2,
   # rightmin => 2,
   # my @points = $hyp->hyphenate($word);

 my $f = new Lingua::EN::Fathom;

 # First print out a report on the whole file.
 $f->analyse_file($filename);
 print($f->report);

 =pod
 Number of characters       : 65102
 Number of words            : 10707
 Percent of complex words   : 15.36
 Average syllables per word : 1.6678
 Number of sentences        : 579
 Average words per sentence : 18.4922
 Number of text lines       : 246
 Number of blank lines      : 248
 Number of paragraphs       : 246


 READABILITY INDICES

 Fog                        : 13.5424
 Flesch                     : 46.9706
 Flesch-Kincaid             : 11.3019
 =cut

 =pod
 # Print out the longest words and their frequency.
 my %words = $f->unique_words;
 my @longest = reverse sort {
      length($a) * $words{$a} <=> length($b) * $words{$b}
   } keys %words;
 @longest = grep { length($_) > 4 } @longest;
 foreach my $i ( 0 .. min(30, $#longest ) ) {
   printf("%-30s %5d\n", $longest[$i], $words{$longest[$i]});
 }
 =cut

 # Now find the longest and least readable paragraphs, and the sentences with the
 # most syllables.  These metrics are a 1 if more is better, and a 0 else.
 my %metrics = (
   num_chars             => 0,
   num_words             => 0,
   percent_complex_words => 0,
   num_sentences         => 0,
   syllables_per_word    => 0,
   words_per_sentence    => 0,
   fog                   => 0,
   flesch                => 1,
   kincaid               => 0,
 );

 my @stats;
 my %syll_for;
 open my $fi, "<", $filename or die $OS_ERROR;
 while ( my $line = <$fi> ) {
   chomp $line;
   next unless $line;

   my $st = { line => $line };
   $f->analyse_block($line);
   foreach my $metric ( keys %metrics ) {
      $st->{$metric} = $f->$metric();
   }

   # My own metric: syllables per paragraph and syllables per sentence.
   $st->{tex_syllables} = 0;
   foreach my $sentence ( split(/[.!?]\s+(?=[A-Z])/, $line ) ) {
      $sentence =~ s/[^\w\s]+//g;
      foreach my $word ( split(/\s+/, $sentence ) ) {
         my $hy = $hyp->hyphenate($word);
         $syll_for{$sentence} += $hy;
         $st->{tex_syllables} += $hy;
      }
   }

   push @stats, $st;
 }
 close $fi;

 my %worst_overall;

 foreach my $metric ( qw(tex_syllables), keys %metrics ) {
   my @top = sort { $a->{$metric} <=> $b->{$metric} } @stats;
   if ( !$metrics{$metric} ) {
      @top = reverse @top;
   }
   if ( $metric eq 'flesch' ) {
      @top = grep { $_->{flesch} != 0 } @top;
   }
   print "====================== $metric\n";
   foreach my $i ( 0 .. min(30, $#top) ) {
      my $st = $top[$i];
      printf("%6.4f %s\n", $st->{$metric}, substr($st->{line}, 0, 70));
      $worst_overall{$st->{line}}++;
   }
   print "\n";
 }

 # Syllables-per-sentence report.
 my @top = reverse sort { $syll_for{$a} <=> $syll_for{$b} } keys %syll_for;
 print "====================== syllables per sentence\n";
 foreach my $i ( 0 .. min(30, $#top) ) {
   my $sent = $top[$i];
   print $syll_for{$sent}, ' ', $sent, "\n";
 }

 # Lines that showed up as bad in most of the Fathom reports.
 @top = reverse sort { $worst_overall{$a} <=> $worst_overall{$b} } keys %worst_overall;
 print "\n====================== worst overall\n";
 foreach my $i ( 0 .. min(30, $#top) ) {
   my $sent = $top[$i];
   print $worst_overall{$sent}, ' ', substr($sent, 0, 70), "\n";
 }
diff --git a/ch04.txt b/ch04.txt
 File name                  : ch04-Schema_Optimization-clean.txt
 Number of characters       : 117260
 Number of words            : 19970
 Percent of complex words   : 13.28
 Average syllables per word : 1.5864
 Number of sentences        : 1068
 Average words per sentence : 18.6985
 Number of text lines       : 430
 Number of blank lines      : 430
 Number of paragraphs       : 430


 READABILITY INDICES

 Fog                        : 12.7934
 Flesch                     : 53.6441
 Flesch-Kincaid             : 10.4223
 ====================== tex_syllables
 68.0000 Another reason to create cache tables is for optimizing search and ret
 62.0000 This design may appeal to developers, because it lets them work in a c
 57.0000 By now, you can probably see the pattern: if a user wants to see both 
 57.0000 However, it's easy to get into trouble by over optimizing queries. For
 55.0000 Both floating point and DECIMAL types let you specify the desired prec
 54.0000 Once you choose a type, make sure you use the same type in all related
 53.0000 The most common way to denormalize data is to duplicate, or cache, cer
 53.0000 In most storage engines, an index can cover queries that access only c
 52.0000 An inexperienced user might think this identifies the column's role as
 52.0000 This query shows that increasing the prefix gives successively smaller
 51.0000 The easiest way to understand indexing concepts is with an illustratio
 50.0000 MySQL AB is working on improving this. Some of the upcoming improvemen
 50.0000 Some kinds of data don't correspond directly to a built in type. A tim
 50.0000 Schemas are often designed from E R (entity relationship) diagrams or 
 49.0000 Storage engines store B Tree indexes differently on disk. This can aff
 48.0000 Be very careful with completely “random” strings, such as those pr
 47.0000 The tools for de fragmenting data and indexes are OPTIMIZE TABLE, a no
 47.0000 This lets MySQL use indexes such as (active, sex, country, age). Of co
 47.0000 If the last_online restriction appears without the age restriction, or
 47.0000 For high concurrency workloads, inserting in primary key order can act
 46.0000 We assume the employee's name is unique and can be used as a primary k
 46.0000 * Tables built upon clustered indexes are subject to page splits when 
 46.0000 With four bytes, the most frequent prefixes occur quite a bit more oft
 46.0000 The data type's complexity is also important. For example, integer dat
 45.0000 Now that you've learned more than you ever wanted to know about indexi
 43.0000 This is the key reason for creating summary tables: to provide statist
 43.0000 Redundant indexes usually appear when people add indexes to the table.
 43.0000 # MySQL can't perform the LIKE operation in the index. This is a limit
 43.0000 When an index covers the query, you'll see “Using Index” in the Ex
 43.0000 Also in contrast to MyISAM, secondary indexes are very different from 
 42.0000 The terms “cache table” and “summary table” don't have a stand

 ====================== flesch
 -44.5100 * Normalized updates are usually faster than denormalized updates.
 -5.2585 The probability of a hash collision grows much faster than you might t
 -4.4375 Here's an example: if you design a product table with an ENUM field to
 8.1015 The differences between clustered and non clustered data layouts, and 
 8.3650 InnoDB supports transactions and four transaction isolation levels.
 11.1754 Clustering the data by the primary key can be very beneficial to the a
 13.1050 We've covered most important data type considerations, some of them wi
 14.6785 Good performance depends on both an optimal schema and optimal indexin
 15.7704 An inexperienced user might think this identifies the column's role as
 16.8000 These advantages can boost performance tremendously, if you design you
 18.1478 You can find a good prefix length by selecting a sample of the most fr
 19.1505 * Covering indexes are especially helpful for InnoDB tables because of
 21.3950 If the Extra column of EXPLAIN contains “Using temporary,” the que
 23.4511 Schemas are often designed from E R (entity relationship) diagrams or 
 24.0141 This design may appeal to developers, because it lets them work in a c
 24.3771 InnoDB stores the same data very differently because of its clustered 
 24.9186 Unfortunately, it doesn't work for unique indexes, because DISABLE KEY
 26.0775 Normalization generally means the degree to which each fact is represe
 26.3050 If you need quick lookups on multi dimensional data from any combinati
 26.3050 Another way to calculate a good prefix length is by computing selectiv
 26.6998 When you add indexes, try to extend existing indexes instead, because 
 26.9712 Any application that keeps counts in a table can eventually run into c
 27.2553 * Secondary (non clustered) indexes are larger because their leaf node
 27.4850 The table's data storage can also become fragmented. However, data sto
 27.5142 Because of the space and computational cost, you should use DECIMAL on
 27.5142 Covering indexes can be a very powerful tool, and can improve performa
 27.6102 Prefix indexes are a great way to make indexes smaller and faster in m
 28.0034 It's harder for MySQL to optimize queries that refer to nullable colum
 28.0071 Another frequent performance nightmare is Object Relational Mapping (O
 28.0500 * Pages become sparsely and irregularly filled because of splitting, s
 28.6543 These limitations make hash indexes useful only in special cases. Howe

 ====================== fog
 36.0000 Here's an example: if you design a product table with an ENUM field to
 33.6706 The probability of a hash collision grows much faster than you might t
 28.7172 You can find a good prefix length by selecting a sample of the most fr
 28.2000 * Normalized updates are usually faster than denormalized updates.
 27.9652 Clustering the data by the primary key can be very beneficial to the a
 27.8571 An inexperienced user might think this identifies the column's role as
 26.5412 The differences between clustered and non clustered data layouts, and 
 26.1306 Prefix indexes are a great way to make indexes smaller and faster in m
 23.5373 A hash index is built on a hash table.<ref name="ftn5">See the compute
 22.9333 These advantages can boost performance tremendously, if you design you
 22.6507 Redundant indexes usually appear when people add indexes to the table.
 22.0000 We've covered most important data type considerations, some of them wi
 21.5692 Adding and removing the LIMIT changes the query plan, which lets us tw
 21.5273 Unfortunately, it doesn't work for unique indexes, because DISABLE KEY
 21.5158 Another frequent performance nightmare is Object Relational Mapping (O
 21.4323 The only difference between the BLOB and TEXT families are that BLOB t
 21.2865 For example, if you store the value b'00111001' (which is the binary e
 21.2808 Be very careful with completely “random” strings, such as those pr
 21.2780 Any application that keeps counts in a table can eventually run into c
 21.1660 When you add indexes, try to extend existing indexes instead, because 
 21.0857 This works well because the MySQL query optimizer notices there's a sm
 20.5846 Good performance depends on both an optimal schema and optimal indexin
 20.3024 Even with the index, the query can be slow if the user interface is pa
 20.3000 Another way to calculate a good prefix length is by computing selectiv
 20.3000 If you need quick lookups on multi dimensional data from any combinati
 20.2154 The main reason to keep redundant indexes is when extending an existin
 20.0795 If someone talks about an index without mentioning a type, it's probab
 20.0000 Now suppose we add another WHERE criterion that we know will reduce th
 19.7091 * Covering indexes are especially helpful for InnoDB tables because of
 19.7026 The moral of the story is, you should strive to insert data in primary
 19.6308 Covering indexes can be a very powerful tool, and can improve performa

 ====================== num_words
 153.0000 Another reason to create cache tables is for optimizing search and ret
 127.0000 * Tables built upon clustered indexes are subject to page splits when 
 121.0000 Oddly enough, the first thing to decide is whether we have to use inde
 118.0000 Both floating point and DECIMAL types let you specify the desired prec
 113.0000 However, it's easy to get into trouble by over optimizing queries. For
 112.0000 By now, you can probably see the pattern: if a user wants to see both 
 112.0000 If the last_online restriction appears without the age restriction, or
 109.0000 This lets MySQL use indexes such as (active, sex, country, age). Of co
 105.0000 We assume the employee's name is unique and can be used as a primary k
 105.0000 An inexperienced user might think this identifies the column's role as
 103.0000 Here's the trick: even if there's a query that doesn't restrict the re
 102.0000 The tools for de fragmenting data and indexes are OPTIMIZE TABLE, a no
 99.0000 Be very careful with completely “random” strings, such as those pr
 98.0000 Once you choose a type, make sure you use the same type in all related
 97.0000 You may have noticed we're insisting on keeping the age column at the 
 94.0000 This example shows how InnoDB can lock rows it doesn't really need, ev
 94.0000 MySQL can sometimes use one index for both sorting and finding rows. T
 93.0000 The easiest way to understand indexing concepts is with an illustratio
 93.0000 This query shows that increasing the prefix gives successively smaller
 93.0000 B Tree indexes work well for lookups by the full key value, a key rang
 92.0000 The most common way to denormalize data is to duplicate, or cache, cer
 92.0000 One way to illustrate this is by showing many variations on a single q
 92.0000 If you require high performance, you must design your schema and index
 91.0000 Redundant keys are a bit different from duplicated keys. If there is a
 91.0000 Unlike some other database servers, you can't choose which index to cl
 89.0000 Because InnoDB locks rows only when it accesses them, an index can red
 89.0000 Some kinds of data don't correspond directly to a built in type. A tim
 88.0000 If you need to store many true or false values, consider MySQL's nativ
 86.0000 In most storage engines, an index can cover queries that access only c
 86.0000 When an index covers the query, you'll see “Using Index” in the Ex
 86.0000 Because it's really up to the storage engines to store the data, not a

 ====================== percent_complex_words
 62.5000 * Normalized updates are usually faster than denormalized updates.
 38.4615 Good performance depends on both an optimal schema and optimal indexin
 37.5000 InnoDB supports transactions and four transaction isolation levels.
 35.0000 B Tree indexes can become fragmented, which reduces performance. Fragm
 35.0000 We've covered most important data type considerations, some of them wi
 33.3333 * Insert speed depends heavily on inserting in clustered key (primary 
 33.3333 These advantages can boost performance tremendously, if you design you
 32.3529 The differences between clustered and non clustered data layouts, and 
 32.1429 In addition to the Memory storage engine's explicit hash indexes, the 
 31.8182 Unfortunately, it doesn't work for unique indexes, because DISABLE KEY
 30.7692 Your choice determines how MySQL stores the data, in memory and on dis
 30.0000 * Covering indexes can use the primary key values contained at the lea
 27.2727 * Covering indexes are especially helpful for InnoDB tables because of
 27.0270 It's harder for MySQL to optimize queries that refer to nullable colum
 26.7606 Storage engines store B Tree indexes differently on disk. This can aff
 26.5306 These limitations make hash indexes useful only in special cases. Howe
 26.4151 When you add indexes, try to extend existing indexes instead, because 
 26.3158 # Use REPAIR TABLE to build the table's indexes. This will build all i
 26.3158 We benchmarked two cases. The first is inserting into a userinfo table
 26.3158 * Secondary (non clustered) indexes are larger because their leaf node
 25.8065 The data type's complexity is also important. For example, integer dat
 25.0000 Sometimes you can create duplicate indexes without knowing it. For exa
 24.3243 Integer types can optionally have the UNSIGNED attribute, which disall
 24.2424 Schemas are often designed from E R (entity relationship) diagrams or 
 23.9130 Clustering the data by the primary key can be very beneficial to the a
 23.5294 People who ask for help with performance issues are frequently advised
 23.5294 Now you know why we said the column order is extremely important: thes
 23.3333 The table's data storage can also become fragmented. However, data sto
 23.0769 Covering indexes can be a very powerful tool, and can improve performa
 23.0769 Because of the space and computational cost, you should use DECIMAL on
 22.7848 This design may appeal to developers, because it lets them work in a c

 ====================== num_chars
 902.0000 Another reason to create cache tables is for optimizing search and ret
 720.0000 Both floating point and DECIMAL types let you specify the desired prec
 691.0000 * Tables built upon clustered indexes are subject to page splits when 
 654.0000 Oddly enough, the first thing to decide is whether we have to use inde
 637.0000 By now, you can probably see the pattern: if a user wants to see both 
 637.0000 However, it's easy to get into trouble by over optimizing queries. For
 627.0000 If the last_online restriction appears without the age restriction, or
 619.0000 Be very careful with completely “random” strings, such as those pr
 617.0000 Once you choose a type, make sure you use the same type in all related
 615.0000 This lets MySQL use indexes such as (active, sex, country, age). Of co
 612.0000 We assume the employee's name is unique and can be used as a primary k
 611.0000 An inexperienced user might think this identifies the column's role as
 580.0000 The tools for de fragmenting data and indexes are OPTIMIZE TABLE, a no
 566.0000 Here's the trick: even if there's a query that doesn't restrict the re
 566.0000 This query shows that increasing the prefix gives successively smaller
 560.0000 The most common way to denormalize data is to duplicate, or cache, cer
 557.0000 The easiest way to understand indexing concepts is with an illustratio
 547.0000 If you require high performance, you must design your schema and index
 541.0000 In most storage engines, an index can cover queries that access only c
 538.0000 Some kinds of data don't correspond directly to a built in type. A tim
 527.0000 When an index covers the query, you'll see “Using Index” in the Ex
 525.0000 You may have noticed we're insisting on keeping the age column at the 
 520.0000 Unlike some other database servers, you can't choose which index to cl
 516.0000 Because it's really up to the storage engines to store the data, not a
 513.0000 Also in contrast to MyISAM, secondary indexes are very different from 
 512.0000 MySQL can sometimes use one index for both sorting and finding rows. T
 511.0000 This example shows how InnoDB can lock rows it doesn't really need, ev
 511.0000 For high concurrency workloads, inserting in primary key order can act
 511.0000 If you need to store many true or false values, consider MySQL's nativ
 510.0000 MySQL AB is working on improving this. Some of the upcoming improvemen
 510.0000 This design may appeal to developers, because it lets them work in a c

 ====================== words_per_sentence
 80.0000 Here's an example: if you design a product table with an ENUM field to
 68.0000 The probability of a hash collision grows much faster than you might t
 58.0000 You can find a good prefix length by selecting a sample of the most fr
 52.5000 An inexperienced user might think this identifies the column's role as
 52.0000 Adding and removing the LIMIT changes the query plan, which lets us tw
 51.0000 A hash index is built on a hash table.<ref name="ftn5">See the compute
 49.0000 Prefix indexes are a great way to make indexes smaller and faster in m
 46.0000 Clustering the data by the primary key can be very beneficial to the a
 42.0000 This works well because the MySQL query optimizer notices there's a sm
 41.0000 Any application that keeps counts in a table can eventually run into c
 41.0000 Even with the index, the query can be slow if the user interface is pa
 40.0000 Now suppose we add another WHERE criterion that we know will reduce th
 39.0000 The main reason to keep redundant indexes is when extending an existin
 39.0000 The moral of the story is, you should strive to insert data in primary
 38.0000 * Index entries are usually much smaller than the full row size, so My
 38.0000 Another frequent performance nightmare is Object Relational Mapping (O
 37.0000 # MySQL is using the index for an ORDER BY with a LIMIT. The index let
 37.0000 For example, if you store the value b'00111001' (which is the binary e
 36.5000 If someone talks about an index without mentioning a type, it's probab
 36.0000 This index can help you find all people whose last name is Smith, and 
 35.5000 Redundant indexes usually appear when people add indexes to the table.
 35.0000 When MySQL performs an index scan, it is accessing the table in index 
 35.0000 We've glossed over some of the details, such as how many internal B Tr
 35.0000 This means you'll lose trailing spaces in CHAR columns, but not in VAR
 34.5000 If your table has many rows and CRC32() gives too many collisions, imp
 34.0000 Your only other options are offline engine specific repair utilities, 
 34.0000 Here's a second query that proves row 1 is locked, even though it didn
 34.0000 The differences between clustered and non clustered data layouts, and 
 34.0000 The performance improvement from changing NULL columns to NOT NULL is 
 33.5000 The MySQL query optimizer uses two API calls to ask the storage engine
 33.5000 Random values also cause caches to perform poorly for all types of que

 ====================== num_sentences
 9.0000 Both floating point and DECIMAL types let you specify the desired prec
 7.0000 Another reason to create cache tables is for optimizing search and ret
 7.0000 Some kinds of data don't correspond directly to a built in type. A tim
 6.0000 We assume the employee's name is unique and can be used as a primary k
 6.0000 This command gives quite a lot of index information, which the MySQL m
 6.0000 Here's the trick: even if there's a query that doesn't restrict the re
 6.0000 A covering index can't be just any kind of index. The index must store
 6.0000 Storage engines store B Tree indexes differently on disk. This can aff
 6.0000 Sometimes you can use an ENUM column instead of conventional string ty
 5.0000 In theory, MySQL could have skipped building a new table. The default 
 5.0000 The most common way to denormalize data is to duplicate, or cache, cer
 5.0000 The tools for de fragmenting data and indexes are OPTIMIZE TABLE, a no
 5.0000 Corrupted indexes can cause queries to return wrong results, or even c
 5.0000 If the last_online restriction appears without the age restriction, or
 5.0000 Oddly enough, the first thing to decide is whether we have to use inde
 5.0000 The easiest way to understand indexing concepts is with an illustratio
 5.0000 The first thing to look at is response time. If your queries are takin
 5.0000 This example shows how InnoDB can lock rows it doesn't really need, ev
 5.0000 In most storage engines, an index can cover queries that access only c
 5.0000 For high concurrency workloads, inserting in primary key order can act
 5.0000 Also in contrast to MyISAM, secondary indexes are very different from 
 5.0000 * Tables built upon clustered indexes are subject to page splits when 
 5.0000 Unlike some other database servers, you can't choose which index to cl
 5.0000 You can often save space and get good performance by indexing the firs
 5.0000 FULLTEXT is a special type of index for MyISAM tables. It finds keywor
 5.0000 Maatkit ([http://maatkit.sourceforge.net/ http://maatkit.sourceforge.n
 5.0000 Before MySQL 5.0, BIT is just a synonym for TINYINT. In MySQL 5.0 and 
 5.0000 TIMESTAMP also has special properties DATETIME doesn't have. By defaul
 5.0000 However, there's another benefit to converting the columns. According 
 5.0000 MySQL supports quite a few string data types, with many variations on 
 5.0000 The DECIMAL type is for storing exact fractional numbers. In MySQL 5.0

 ====================== syllables_per_word
 2.8750 * Normalized updates are usually faster than denormalized updates.
 2.2500 InnoDB supports transactions and four transaction isolation levels.
 2.1154 Good performance depends on both an optimal schema and optimal indexin
 2.0500 We've covered most important data type considerations, some of them wi
 2.0000 The table's data storage can also become fragmented. However, data sto
 2.0000 If the Extra column of EXPLAIN contains “Using temporary,” the que
 1.9697 Schemas are often designed from E R (entity relationship) diagrams or 
 1.9592 These limitations make hash indexes useful only in special cases. Howe
 1.9583 These advantages can boost performance tremendously, if you design you
 1.9545 * Covering indexes are especially helpful for InnoDB tables because of
 1.9487 Your choice determines how MySQL stores the data, in memory and on dis
 1.9412 The differences between clustered and non clustered data layouts, and 
 1.9333 * Pages become sparsely and irregularly filled because of splitting, s
 1.9286 * Normalized tables are usually smaller, so they fit better in memory 
 1.9286 In addition to the Memory storage engine's explicit hash indexes, the 
 1.9241 This design may appeal to developers, because it lets them work in a c
 1.9167 * Insert speed depends heavily on inserting in clustered key (primary 
 1.9048 InnoDB stores the same data very differently because of its clustered 
 1.8947 * Secondary (non clustered) indexes are larger because their leaf node
 1.8919 It's harder for MySQL to optimize queries that refer to nullable colum
 1.8873 Storage engines store B Tree indexes differently on disk. This can aff
 1.8864 Unfortunately, it doesn't work for unique indexes, because DISABLE KEY
 1.8750 Figure 4 TODO shows a simplified B Tree index's structure.
 1.8667 Normalization generally means the degree to which each fact is represe
 1.8571 DECIMAL math in MySQL 5.0 and newer uses complex emulation, because CP
 1.8500 B Tree indexes can become fragmented, which reduces performance. Fragm
 1.8485 Notice the auto incrementing integer primary key. The second case is a
 1.8485 MySQL has a few storage types that use individual bits within a value 
 1.8421 We benchmarked two cases. The first is inserting into a userinfo table
 1.8400 Sometimes you should go beyond the role of a developer, and question t
 1.8387 The data type's complexity is also important. For example, integer dat

 ====================== kincaid
 33.7525 Here's an example: if you design a product table with an ENUM field to
 30.8859 The probability of a hash collision grows much faster than you might t
 25.1369 You can find a good prefix length by selecting a sample of the most fr
 24.1021 An inexperienced user might think this identifies the column's role as
 23.1283 Clustering the data by the primary key can be very beneficial to the a
 21.8843 A hash index is built on a hash table.<ref name="ftn5">See the compute
 21.5812 Prefix indexes are a great way to make indexes smaller and faster in m
 21.4550 * Normalized updates are usually faster than denormalized updates.
 21.2554 Adding and removing the LIMIT changes the query plan, which lets us tw
 20.5759 The differences between clustered and non clustered data layouts, and 
 19.6829 Any application that keeps counts in a table can eventually run into c
 18.7932 Another frequent performance nightmare is Object Relational Mapping (O
 18.4900 This works well because the MySQL query optimizer notices there's a sm
 18.2277 The main reason to keep redundant indexes is when extending an existin
 18.0325 Redundant indexes usually appear when people add indexes to the table.
 17.9561 Even with the index, the query can be slow if the user interface is pa
 17.8806 If someone talks about an index without mentioning a type, it's probab
 17.8616 * Index entries are usually much smaller than the full row size, so My
 17.7738 The moral of the story is, you should strive to insert data in primary
 17.7100 Now suppose we add another WHERE criterion that we know will reduce th
 17.5400 Another way to calculate a good prefix length is by computing selectiv
 17.5400 If you need quick lookups on multi dimensional data from any combinati
 17.4524 Here's a second query that proves row 1 is locked, even though it didn
 17.2771 This means you'll lose trailing spaces in CHAR columns, but not in VAR
 17.2004 The MySQL query optimizer uses two API calls to ask the storage engine
 17.1053 The performance improvement from changing NULL columns to NOT NULL is 
 17.0184 For example, if you store the value b'00111001' (which is the binary e
 16.9467 Be very careful with completely “random” strings, such as those pr
 16.9400 We've glossed over some of the details, such as how many internal B Tr
 16.8783 These advantages can boost performance tremendously, if you design you
 16.5891 There's a way to work around both problems with a combination of cleve

 ====================== syllables per sentence
 30 There is typically no reason to do so unless you want to have different types of indexes on the same column to satisfy different kinds of queriesref nameftn12An index is not necessarily a duplicate if its a different type of index there are often good reasons to have KEYcol and FULLTEXT KEYnameref
 29 Heres an example if you design a product table with an ENUM field to define the product type you might want a lookup table primary keyed on an identical ENUM field You could add columns to the lookup table for descriptive text to generate a glossary or to provide meaningful labels in a pull down menu on a website In this case youll want to use the ENUM as an identifier but for most purposes you should avoid doing so
 26 There is currently no way to de fragment InnoDB indexes as they cant be built by a sort in MySQL 50ref nameftn16The InnoDB developers are working on this problem at the time of writingref Even dropping and re creating them may result in fragmented indexes depending on the data
 26 We suggest you think carefully before trading performance for developer productivity and always test on a realistically large data set so you dont discover performance problems too late
 24 They slow INSERT because the inserted value has to go in a random location in indexes which can cause excessive page splits and random disk accesses and causes clustered index fragmentation for clustered storage engines such as InnoDB
 23 These indexes will satisfy the most frequently specified search queries but how can we design indexes for less common options such as has_pictures eye_color hair_color and education
 23 It shows average selectivity but its also important to account for worst case selectivity which will be different from the average unless the data is evenly distributed
 23 When you add indexes try to extend existing indexes instead because it is usually more efficient to maintain one multi column index than several single column indexes
 22 In most cases you dont want redundant indexes and should extend existing indexes rather than adding new ones but it is not as clear cut as with duplicate indexesthere are times when you need redundant indexes for performance reasons 
 22 The differences between clustered and non clustered data layouts and the corresponding differences between primary and secondary indexes can be confusing and surprising
 22 This illustrates a general principle when youre designing indexes keep in mind not only the kinds of indexes you need for existing queries but consider the queries to be candidates for optimization too
 22 This is a technique that makes indexes much faster to build and results in a compact index layout InnoDB currently builds its indexes a row at a time in primary key order which means the index trees arent built in optimal order and are fragmented
 21 Integer types can optionally have the UNSIGNED attribute which disallows negative values and approximately doubles the upper limit of positive values you can store
 21 For high concurrency workloads inserting in primary key order can actually create a single point of contention in InnoDB as it is currently implemented
 21 If MySQL knows itll need to access the row at any point while executing the query MySQL will always read the roweven when a better query plan is theoretically possibleand the index wont cover the queryref nameftn9This may change in future MySQL versions but it applies as of MySQL 50 and 51ref
 21 Because of the space and computational cost you should use DECIMAL only when you need exact results for fractional numbersfor example when performing financial operations
 20 If you need quick lookups on multi dimensional data from any combination of dimensions you might consider storing non spatial data into a geospatial type just so you can have spatial indexes
 20 For example if there is no index on category and theres an index on price a filesort may be better or worse than an index scan depending on the selectivity of the category column
 20 MyISAM uses prefix compression to reduce index size allowing more of the index to fit in memory and improving performance dramatically in some cases
 20 These queries often need a particular table and index structure which is different from the one you would use for general OLTP online transaction processing operations
 20 Example 2 where the second condition filter leaves only a small set of results after index filtering shows how effective the proposed optimization is five times better on our data
 20 Weve covered most important data type considerations some of them with serious performance implications others with just minor performance effects
 20 This is true in general adding new indexes may have a dramatic performance impact for INSERT UPDATE and DELETE operations especially if the new index causes you to hit memory limits
 20 Try to avoid the common mistake of creating indexes without knowing which queries will use them and consider whether all indexes together are an optimal configuration
 19 This type of fragmentation only affects some operations such as full table scans and clustered index range scans which normally benefit from a sequential data layout on disk
 19 Refer to computer science literature for a detailed explanation of B Tree indexesref This is the only index type all of MySQLs storage engines support
 19 We mention some other strategies to optimize counter operations on page TODO application level optimization I think counters in memcache and updating a row with RAND
 19 The optimizer uses the statistics to estimate the number of rows a query will examine which is important because MySQLs optimizer is cost based and the main cost metric is how much data the query will access
 18 For example DECIMAL18 9 will store 9 digits from each side of the decimal point using 9 bytes in total 4 for the digits before the decimal point one for the decimal point itself and 4 bytes for the digits after decimal point
 18 If someone talks about an index without mentioning a type its probably a B Tree index which uses a B Tree data structure to store its dataref nameftn3Its actually a BTree index since each leaf node contains a link to the next for fast range traversals through nodes
 18 MySQL can perform prefix match LIKE patterns in the index because MySQL can convert them to simple comparisons but the leading wildcard in the query makes it impossible for the storage engine to evaluate the match

 ====================== worst overall
 7 An inexperienced user might think this identifies the column's role as
 6 The differences between clustered and non clustered data layouts, and 
 5 This design may appeal to developers, because it lets them work in a c
 5 * Normalized updates are usually faster than denormalized updates.
 5 Be very careful with completely “random” strings, such as those pr
 5 These advantages can boost performance tremendously, if you design you
 5 Clustering the data by the primary key can be very beneficial to the a
 4 We assume the employee's name is unique and can be used as a primary k
 4 The easiest way to understand indexing concepts is with an illustratio
 4 * Tables built upon clustered indexes are subject to page splits when 
 4 Unfortunately, it doesn't work for unique indexes, because DISABLE KEY
 4 Another frequent performance nightmare is Object Relational Mapping (O
 4 Some kinds of data don't correspond directly to a built in type. A tim
 4 Here's an example: if you design a product table with an ENUM field to
 4 Another reason to create cache tables is for optimizing search and ret
 4 Redundant indexes usually appear when people add indexes to the table.
 4 Prefix indexes are a great way to make indexes smaller and faster in m
 4 We've covered most important data type considerations, some of them wi
 4 The tools for de fragmenting data and indexes are OPTIMIZE TABLE, a no
 4 If the last_online restriction appears without the age restriction, or
 4 The most common way to denormalize data is to duplicate, or cache, cer
 4 Good performance depends on both an optimal schema and optimal indexin
 4 The probability of a hash collision grows much faster than you might t
 4 Any application that keeps counts in a table can eventually run into c
 4 You can find a good prefix length by selecting a sample of the most fr
 4 * Covering indexes are especially helpful for InnoDB tables because of
 4 Storage engines store B Tree indexes differently on disk. This can aff
 4 Both floating point and DECIMAL types let you specify the desired prec
 4 Schemas are often designed from E R (entity relationship) diagrams or 
 4 In most storage engines, an index can cover queries that access only c
 3 Here's the trick: even if there's a query that doesn't restrict the re
diff --git a/clean_text.pl b/clean_text.pl
 #!/usr/bin/perl

 use strict;
 use warnings FATAL => 'all';

 use English qw(-no_match_vars);

 my $filename = shift or die "Need a filename";

 open my $fh, "<", $filename or die $OS_ERROR;
 (my $fileout = $filename) =~ s/.txt/-clean.txt/;
 open my $fo, ">", $fileout  or die $OS_ERROR;

 my $wasblank;
 while ( my $line = <$fh> ) {
   chomp $line;
   if ( !$line ) {
      $wasblank = 1;
      print $fo "\n" unless $wasblank;
      next;
   }
   next if $line =~ m/nowiki|<center>|'''|\||^ /;
   next unless $line =~ m/\s/ && $line =~ m/\./;
   $line =~ s!</?tt>|''!!g;
   $line =~ s/\n\n+/\n/g;
   $line =~ s/(\w)-(\w)/$1 $2/g;
   print $fo $line, "\n\n";
   $wasblank = 0;
 }

 close $fh;
 close $fo;
	#!/usr/bin/perl

	use strict;
	use warnings FATAL => 'all';

	use English qw(-no_match_vars);
	use Lingua::EN::Fathom;
	use TeX::Hyphen;
	use List::Util qw(min);

	my $filename = shift or die "Give me a filename";

	my $hyp = new TeX::Hyphen();
	# file => 'hyphen.tex',
	# style => 'czech',
	# leftmin => 2,
	# rightmin => 2,
	# my @points = $hyp->hyphenate($word);

	my $f = new Lingua::EN::Fathom;

	# First print out a report on the whole file.
	$f->analyse_file($filename);
	print($f->report);

	=pod
	Number of characters : 65102
	Number of words : 10707
	Percent of complex words : 15.36
	Average syllables per word : 1.6678
	Number of sentences : 579
	Average words per sentence : 18.4922
	Number of text lines : 246
	Number of blank lines : 248
	Number of paragraphs : 246


	READABILITY INDICES

	Fog : 13.5424
	Flesch : 46.9706
	Flesch-Kincaid : 11.3019
	=cut

	=pod
	# Print out the longest words and their frequency.
	my %words = $f->unique_words;
	my @longest = reverse sort {
	length($a) * $words{$a} <=> length($b) * $words{$b}
	} keys %words;
	@longest = grep { length($_) > 4 } @longest;
	foreach my $i ( 0 .. min(30, $#longest ) ) {
	printf("%-30s %5d\n", $longest[$i], $words{$longest[$i]});
	}
	=cut

	# Now find the longest and least readable paragraphs, and the sentences with the
	# most syllables. These metrics are a 1 if more is better, and a 0 else.
	my %metrics = (
	num_chars => 0,
	num_words => 0,
	percent_complex_words => 0,
	num_sentences => 0,
	syllables_per_word => 0,
	words_per_sentence => 0,
	fog => 0,
	flesch => 1,
	kincaid => 0,
	);

	my @stats;
	my %syll_for;
	open my $fi, "<", $filename or die $OS_ERROR;
	while ( my $line = <$fi> ) {
	chomp $line;
	next unless $line;

	my $st = { line => $line };
	$f->analyse_block($line);
	foreach my $metric ( keys %metrics ) {
	$st->{$metric} = $f->$metric();
	}

	# My own metric: syllables per paragraph and syllables per sentence.
	$st->{tex_syllables} = 0;
	foreach my $sentence ( split(/[.!?]\s+(?=[A-Z])/, $line ) ) {
	$sentence =~ s/[^\w\s]+//g;
	foreach my $word ( split(/\s+/, $sentence ) ) {
	my $hy = $hyp->hyphenate($word);
	$syll_for{$sentence} += $hy;
	$st->{tex_syllables} += $hy;
	}
	}

	push @stats, $st;
	}
	close $fi;

	my %worst_overall;

	foreach my $metric ( qw(tex_syllables), keys %metrics ) {
	my @top = sort { $a->{$metric} <=> $b->{$metric} } @stats;
	if ( !$metrics{$metric} ) {
	@top = reverse @top;
	}
	if ( $metric eq 'flesch' ) {
	@top = grep { $_->{flesch} != 0 } @top;
	}
	print "====================== $metric\n";
	foreach my $i ( 0 .. min(30, $#top) ) {
	my $st = $top[$i];
	printf("%6.4f %s\n", $st->{$metric}, substr($st->{line}, 0, 70));
	$worst_overall{$st->{line}}++;
	}
	print "\n";
	}

	# Syllables-per-sentence report.
	my @top = reverse sort { $syll_for{$a} <=> $syll_for{$b} } keys %syll_for;
	print "====================== syllables per sentence\n";
	foreach my $i ( 0 .. min(30, $#top) ) {
	my $sent = $top[$i];
	print $syll_for{$sent}, ' ', $sent, "\n";
	}

	# Lines that showed up as bad in most of the Fathom reports.
	@top = reverse sort { $worst_overall{$a} <=> $worst_overall{$b} } keys %worst_overall;
	print "\n====================== worst overall\n";
	foreach my $i ( 0 .. min(30, $#top) ) {
	my $sent = $top[$i];
	print $worst_overall{$sent}, ' ', substr($sent, 0, 70), "\n";
	}