Created
January 18, 2014 15:49
-
-
Save xaprb/8492226 to your computer and use it in GitHub Desktop.
Files that support http://www.xaprb.com/blog/2008/06/23/what-its-like-to-write-a-technical-book-continued/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings FATAL => 'all'; | |
use English qw(-no_match_vars); | |
use Lingua::EN::Fathom; | |
use TeX::Hyphen; | |
use List::Util qw(min); | |
my $filename = shift or die "Give me a filename"; | |
my $hyp = new TeX::Hyphen(); | |
# file => 'hyphen.tex', | |
# style => 'czech', | |
# leftmin => 2, | |
# rightmin => 2, | |
# my @points = $hyp->hyphenate($word); | |
my $f = new Lingua::EN::Fathom; | |
# First print out a report on the whole file. | |
$f->analyse_file($filename); | |
print($f->report); | |
=pod | |
Number of characters : 65102 | |
Number of words : 10707 | |
Percent of complex words : 15.36 | |
Average syllables per word : 1.6678 | |
Number of sentences : 579 | |
Average words per sentence : 18.4922 | |
Number of text lines : 246 | |
Number of blank lines : 248 | |
Number of paragraphs : 246 | |
READABILITY INDICES | |
Fog : 13.5424 | |
Flesch : 46.9706 | |
Flesch-Kincaid : 11.3019 | |
=cut | |
=pod | |
# Print out the longest words and their frequency. | |
my %words = $f->unique_words; | |
my @longest = reverse sort { | |
length($a) * $words{$a} <=> length($b) * $words{$b} | |
} keys %words; | |
@longest = grep { length($_) > 4 } @longest; | |
foreach my $i ( 0 .. min(30, $#longest ) ) { | |
printf("%-30s %5d\n", $longest[$i], $words{$longest[$i]}); | |
} | |
=cut | |
# Now find the longest and least readable paragraphs, and the sentences with the | |
# most syllables. These metrics are a 1 if more is better, and a 0 else. | |
my %metrics = ( | |
num_chars => 0, | |
num_words => 0, | |
percent_complex_words => 0, | |
num_sentences => 0, | |
syllables_per_word => 0, | |
words_per_sentence => 0, | |
fog => 0, | |
flesch => 1, | |
kincaid => 0, | |
); | |
my @stats; | |
my %syll_for; | |
open my $fi, "<", $filename or die $OS_ERROR; | |
while ( my $line = <$fi> ) { | |
chomp $line; | |
next unless $line; | |
my $st = { line => $line }; | |
$f->analyse_block($line); | |
foreach my $metric ( keys %metrics ) { | |
$st->{$metric} = $f->$metric(); | |
} | |
# My own metric: syllables per paragraph and syllables per sentence. | |
$st->{tex_syllables} = 0; | |
foreach my $sentence ( split(/[.!?]\s+(?=[A-Z])/, $line ) ) { | |
$sentence =~ s/[^\w\s]+//g; | |
foreach my $word ( split(/\s+/, $sentence ) ) { | |
my $hy = $hyp->hyphenate($word); | |
$syll_for{$sentence} += $hy; | |
$st->{tex_syllables} += $hy; | |
} | |
} | |
push @stats, $st; | |
} | |
close $fi; | |
my %worst_overall; | |
foreach my $metric ( qw(tex_syllables), keys %metrics ) { | |
my @top = sort { $a->{$metric} <=> $b->{$metric} } @stats; | |
if ( !$metrics{$metric} ) { | |
@top = reverse @top; | |
} | |
if ( $metric eq 'flesch' ) { | |
@top = grep { $_->{flesch} != 0 } @top; | |
} | |
print "====================== $metric\n"; | |
foreach my $i ( 0 .. min(30, $#top) ) { | |
my $st = $top[$i]; | |
printf("%6.4f %s\n", $st->{$metric}, substr($st->{line}, 0, 70)); | |
$worst_overall{$st->{line}}++; | |
} | |
print "\n"; | |
} | |
# Syllables-per-sentence report. | |
my @top = reverse sort { $syll_for{$a} <=> $syll_for{$b} } keys %syll_for; | |
print "====================== syllables per sentence\n"; | |
foreach my $i ( 0 .. min(30, $#top) ) { | |
my $sent = $top[$i]; | |
print $syll_for{$sent}, ' ', $sent, "\n"; | |
} | |
# Lines that showed up as bad in most of the Fathom reports. | |
@top = reverse sort { $worst_overall{$a} <=> $worst_overall{$b} } keys %worst_overall; | |
print "\n====================== worst overall\n"; | |
foreach my $i ( 0 .. min(30, $#top) ) { | |
my $sent = $top[$i]; | |
print $worst_overall{$sent}, ' ', substr($sent, 0, 70), "\n"; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File name : ch04-Schema_Optimization-clean.txt | |
Number of characters : 117260 | |
Number of words : 19970 | |
Percent of complex words : 13.28 | |
Average syllables per word : 1.5864 | |
Number of sentences : 1068 | |
Average words per sentence : 18.6985 | |
Number of text lines : 430 | |
Number of blank lines : 430 | |
Number of paragraphs : 430 | |
READABILITY INDICES | |
Fog : 12.7934 | |
Flesch : 53.6441 | |
Flesch-Kincaid : 10.4223 | |
====================== tex_syllables | |
68.0000 Another reason to create cache tables is for optimizing search and ret | |
62.0000 This design may appeal to developers, because it lets them work in a c | |
57.0000 By now, you can probably see the pattern: if a user wants to see both | |
57.0000 However, it's easy to get into trouble by over optimizing queries. For | |
55.0000 Both floating point and DECIMAL types let you specify the desired prec | |
54.0000 Once you choose a type, make sure you use the same type in all related | |
53.0000 The most common way to denormalize data is to duplicate, or cache, cer | |
53.0000 In most storage engines, an index can cover queries that access only c | |
52.0000 An inexperienced user might think this identifies the column's role as | |
52.0000 This query shows that increasing the prefix gives successively smaller | |
51.0000 The easiest way to understand indexing concepts is with an illustratio | |
50.0000 MySQL AB is working on improving this. Some of the upcoming improvemen | |
50.0000 Some kinds of data don't correspond directly to a built in type. A tim | |
50.0000 Schemas are often designed from E R (entity relationship) diagrams or | |
49.0000 Storage engines store B Tree indexes differently on disk. This can aff | |
48.0000 Be very careful with completely “random” strings, such as those pr | |
47.0000 The tools for de fragmenting data and indexes are OPTIMIZE TABLE, a no | |
47.0000 This lets MySQL use indexes such as (active, sex, country, age). Of co | |
47.0000 If the last_online restriction appears without the age restriction, or | |
47.0000 For high concurrency workloads, inserting in primary key order can act | |
46.0000 We assume the employee's name is unique and can be used as a primary k | |
46.0000 * Tables built upon clustered indexes are subject to page splits when | |
46.0000 With four bytes, the most frequent prefixes occur quite a bit more oft | |
46.0000 The data type's complexity is also important. For example, integer dat | |
45.0000 Now that you've learned more than you ever wanted to know about indexi | |
43.0000 This is the key reason for creating summary tables: to provide statist | |
43.0000 Redundant indexes usually appear when people add indexes to the table. | |
43.0000 # MySQL can't perform the LIKE operation in the index. This is a limit | |
43.0000 When an index covers the query, you'll see “Using Index” in the Ex | |
43.0000 Also in contrast to MyISAM, secondary indexes are very different from | |
42.0000 The terms “cache table” and “summary table” don't have a stand | |
====================== flesch | |
-44.5100 * Normalized updates are usually faster than denormalized updates. | |
-5.2585 The probability of a hash collision grows much faster than you might t | |
-4.4375 Here's an example: if you design a product table with an ENUM field to | |
8.1015 The differences between clustered and non clustered data layouts, and | |
8.3650 InnoDB supports transactions and four transaction isolation levels. | |
11.1754 Clustering the data by the primary key can be very beneficial to the a | |
13.1050 We've covered most important data type considerations, some of them wi | |
14.6785 Good performance depends on both an optimal schema and optimal indexin | |
15.7704 An inexperienced user might think this identifies the column's role as | |
16.8000 These advantages can boost performance tremendously, if you design you | |
18.1478 You can find a good prefix length by selecting a sample of the most fr | |
19.1505 * Covering indexes are especially helpful for InnoDB tables because of | |
21.3950 If the Extra column of EXPLAIN contains “Using temporary,” the que | |
23.4511 Schemas are often designed from E R (entity relationship) diagrams or | |
24.0141 This design may appeal to developers, because it lets them work in a c | |
24.3771 InnoDB stores the same data very differently because of its clustered | |
24.9186 Unfortunately, it doesn't work for unique indexes, because DISABLE KEY | |
26.0775 Normalization generally means the degree to which each fact is represe | |
26.3050 If you need quick lookups on multi dimensional data from any combinati | |
26.3050 Another way to calculate a good prefix length is by computing selectiv | |
26.6998 When you add indexes, try to extend existing indexes instead, because | |
26.9712 Any application that keeps counts in a table can eventually run into c | |
27.2553 * Secondary (non clustered) indexes are larger because their leaf node | |
27.4850 The table's data storage can also become fragmented. However, data sto | |
27.5142 Because of the space and computational cost, you should use DECIMAL on | |
27.5142 Covering indexes can be a very powerful tool, and can improve performa | |
27.6102 Prefix indexes are a great way to make indexes smaller and faster in m | |
28.0034 It's harder for MySQL to optimize queries that refer to nullable colum | |
28.0071 Another frequent performance nightmare is Object Relational Mapping (O | |
28.0500 * Pages become sparsely and irregularly filled because of splitting, s | |
28.6543 These limitations make hash indexes useful only in special cases. Howe | |
====================== fog | |
36.0000 Here's an example: if you design a product table with an ENUM field to | |
33.6706 The probability of a hash collision grows much faster than you might t | |
28.7172 You can find a good prefix length by selecting a sample of the most fr | |
28.2000 * Normalized updates are usually faster than denormalized updates. | |
27.9652 Clustering the data by the primary key can be very beneficial to the a | |
27.8571 An inexperienced user might think this identifies the column's role as | |
26.5412 The differences between clustered and non clustered data layouts, and | |
26.1306 Prefix indexes are a great way to make indexes smaller and faster in m | |
23.5373 A hash index is built on a hash table.<ref name="ftn5">See the compute | |
22.9333 These advantages can boost performance tremendously, if you design you | |
22.6507 Redundant indexes usually appear when people add indexes to the table. | |
22.0000 We've covered most important data type considerations, some of them wi | |
21.5692 Adding and removing the LIMIT changes the query plan, which lets us tw | |
21.5273 Unfortunately, it doesn't work for unique indexes, because DISABLE KEY | |
21.5158 Another frequent performance nightmare is Object Relational Mapping (O | |
21.4323 The only difference between the BLOB and TEXT families are that BLOB t | |
21.2865 For example, if you store the value b'00111001' (which is the binary e | |
21.2808 Be very careful with completely “random” strings, such as those pr | |
21.2780 Any application that keeps counts in a table can eventually run into c | |
21.1660 When you add indexes, try to extend existing indexes instead, because | |
21.0857 This works well because the MySQL query optimizer notices there's a sm | |
20.5846 Good performance depends on both an optimal schema and optimal indexin | |
20.3024 Even with the index, the query can be slow if the user interface is pa | |
20.3000 Another way to calculate a good prefix length is by computing selectiv | |
20.3000 If you need quick lookups on multi dimensional data from any combinati | |
20.2154 The main reason to keep redundant indexes is when extending an existin | |
20.0795 If someone talks about an index without mentioning a type, it's probab | |
20.0000 Now suppose we add another WHERE criterion that we know will reduce th | |
19.7091 * Covering indexes are especially helpful for InnoDB tables because of | |
19.7026 The moral of the story is, you should strive to insert data in primary | |
19.6308 Covering indexes can be a very powerful tool, and can improve performa | |
====================== num_words | |
153.0000 Another reason to create cache tables is for optimizing search and ret | |
127.0000 * Tables built upon clustered indexes are subject to page splits when | |
121.0000 Oddly enough, the first thing to decide is whether we have to use inde | |
118.0000 Both floating point and DECIMAL types let you specify the desired prec | |
113.0000 However, it's easy to get into trouble by over optimizing queries. For | |
112.0000 By now, you can probably see the pattern: if a user wants to see both | |
112.0000 If the last_online restriction appears without the age restriction, or | |
109.0000 This lets MySQL use indexes such as (active, sex, country, age). Of co | |
105.0000 We assume the employee's name is unique and can be used as a primary k | |
105.0000 An inexperienced user might think this identifies the column's role as | |
103.0000 Here's the trick: even if there's a query that doesn't restrict the re | |
102.0000 The tools for de fragmenting data and indexes are OPTIMIZE TABLE, a no | |
99.0000 Be very careful with completely “random” strings, such as those pr | |
98.0000 Once you choose a type, make sure you use the same type in all related | |
97.0000 You may have noticed we're insisting on keeping the age column at the | |
94.0000 This example shows how InnoDB can lock rows it doesn't really need, ev | |
94.0000 MySQL can sometimes use one index for both sorting and finding rows. T | |
93.0000 The easiest way to understand indexing concepts is with an illustratio | |
93.0000 This query shows that increasing the prefix gives successively smaller | |
93.0000 B Tree indexes work well for lookups by the full key value, a key rang | |
92.0000 The most common way to denormalize data is to duplicate, or cache, cer | |
92.0000 One way to illustrate this is by showing many variations on a single q | |
92.0000 If you require high performance, you must design your schema and index | |
91.0000 Redundant keys are a bit different from duplicated keys. If there is a | |
91.0000 Unlike some other database servers, you can't choose which index to cl | |
89.0000 Because InnoDB locks rows only when it accesses them, an index can red | |
89.0000 Some kinds of data don't correspond directly to a built in type. A tim | |
88.0000 If you need to store many true or false values, consider MySQL's nativ | |
86.0000 In most storage engines, an index can cover queries that access only c | |
86.0000 When an index covers the query, you'll see “Using Index” in the Ex | |
86.0000 Because it's really up to the storage engines to store the data, not a | |
====================== percent_complex_words | |
62.5000 * Normalized updates are usually faster than denormalized updates. | |
38.4615 Good performance depends on both an optimal schema and optimal indexin | |
37.5000 InnoDB supports transactions and four transaction isolation levels. | |
35.0000 B Tree indexes can become fragmented, which reduces performance. Fragm | |
35.0000 We've covered most important data type considerations, some of them wi | |
33.3333 * Insert speed depends heavily on inserting in clustered key (primary | |
33.3333 These advantages can boost performance tremendously, if you design you | |
32.3529 The differences between clustered and non clustered data layouts, and | |
32.1429 In addition to the Memory storage engine's explicit hash indexes, the | |
31.8182 Unfortunately, it doesn't work for unique indexes, because DISABLE KEY | |
30.7692 Your choice determines how MySQL stores the data, in memory and on dis | |
30.0000 * Covering indexes can use the primary key values contained at the lea | |
27.2727 * Covering indexes are especially helpful for InnoDB tables because of | |
27.0270 It's harder for MySQL to optimize queries that refer to nullable colum | |
26.7606 Storage engines store B Tree indexes differently on disk. This can aff | |
26.5306 These limitations make hash indexes useful only in special cases. Howe | |
26.4151 When you add indexes, try to extend existing indexes instead, because | |
26.3158 # Use REPAIR TABLE to build the table's indexes. This will build all i | |
26.3158 We benchmarked two cases. The first is inserting into a userinfo table | |
26.3158 * Secondary (non clustered) indexes are larger because their leaf node | |
25.8065 The data type's complexity is also important. For example, integer dat | |
25.0000 Sometimes you can create duplicate indexes without knowing it. For exa | |
24.3243 Integer types can optionally have the UNSIGNED attribute, which disall | |
24.2424 Schemas are often designed from E R (entity relationship) diagrams or | |
23.9130 Clustering the data by the primary key can be very beneficial to the a | |
23.5294 People who ask for help with performance issues are frequently advised | |
23.5294 Now you know why we said the column order is extremely important: thes | |
23.3333 The table's data storage can also become fragmented. However, data sto | |
23.0769 Covering indexes can be a very powerful tool, and can improve performa | |
23.0769 Because of the space and computational cost, you should use DECIMAL on | |
22.7848 This design may appeal to developers, because it lets them work in a c | |
====================== num_chars | |
902.0000 Another reason to create cache tables is for optimizing search and ret | |
720.0000 Both floating point and DECIMAL types let you specify the desired prec | |
691.0000 * Tables built upon clustered indexes are subject to page splits when | |
654.0000 Oddly enough, the first thing to decide is whether we have to use inde | |
637.0000 By now, you can probably see the pattern: if a user wants to see both | |
637.0000 However, it's easy to get into trouble by over optimizing queries. For | |
627.0000 If the last_online restriction appears without the age restriction, or | |
619.0000 Be very careful with completely “random” strings, such as those pr | |
617.0000 Once you choose a type, make sure you use the same type in all related | |
615.0000 This lets MySQL use indexes such as (active, sex, country, age). Of co | |
612.0000 We assume the employee's name is unique and can be used as a primary k | |
611.0000 An inexperienced user might think this identifies the column's role as | |
580.0000 The tools for de fragmenting data and indexes are OPTIMIZE TABLE, a no | |
566.0000 Here's the trick: even if there's a query that doesn't restrict the re | |
566.0000 This query shows that increasing the prefix gives successively smaller | |
560.0000 The most common way to denormalize data is to duplicate, or cache, cer | |
557.0000 The easiest way to understand indexing concepts is with an illustratio | |
547.0000 If you require high performance, you must design your schema and index | |
541.0000 In most storage engines, an index can cover queries that access only c | |
538.0000 Some kinds of data don't correspond directly to a built in type. A tim | |
527.0000 When an index covers the query, you'll see “Using Index” in the Ex | |
525.0000 You may have noticed we're insisting on keeping the age column at the | |
520.0000 Unlike some other database servers, you can't choose which index to cl | |
516.0000 Because it's really up to the storage engines to store the data, not a | |
513.0000 Also in contrast to MyISAM, secondary indexes are very different from | |
512.0000 MySQL can sometimes use one index for both sorting and finding rows. T | |
511.0000 This example shows how InnoDB can lock rows it doesn't really need, ev | |
511.0000 For high concurrency workloads, inserting in primary key order can act | |
511.0000 If you need to store many true or false values, consider MySQL's nativ | |
510.0000 MySQL AB is working on improving this. Some of the upcoming improvemen | |
510.0000 This design may appeal to developers, because it lets them work in a c | |
====================== words_per_sentence | |
80.0000 Here's an example: if you design a product table with an ENUM field to | |
68.0000 The probability of a hash collision grows much faster than you might t | |
58.0000 You can find a good prefix length by selecting a sample of the most fr | |
52.5000 An inexperienced user might think this identifies the column's role as | |
52.0000 Adding and removing the LIMIT changes the query plan, which lets us tw | |
51.0000 A hash index is built on a hash table.<ref name="ftn5">See the compute | |
49.0000 Prefix indexes are a great way to make indexes smaller and faster in m | |
46.0000 Clustering the data by the primary key can be very beneficial to the a | |
42.0000 This works well because the MySQL query optimizer notices there's a sm | |
41.0000 Any application that keeps counts in a table can eventually run into c | |
41.0000 Even with the index, the query can be slow if the user interface is pa | |
40.0000 Now suppose we add another WHERE criterion that we know will reduce th | |
39.0000 The main reason to keep redundant indexes is when extending an existin | |
39.0000 The moral of the story is, you should strive to insert data in primary | |
38.0000 * Index entries are usually much smaller than the full row size, so My | |
38.0000 Another frequent performance nightmare is Object Relational Mapping (O | |
37.0000 # MySQL is using the index for an ORDER BY with a LIMIT. The index let | |
37.0000 For example, if you store the value b'00111001' (which is the binary e | |
36.5000 If someone talks about an index without mentioning a type, it's probab | |
36.0000 This index can help you find all people whose last name is Smith, and | |
35.5000 Redundant indexes usually appear when people add indexes to the table. | |
35.0000 When MySQL performs an index scan, it is accessing the table in index | |
35.0000 We've glossed over some of the details, such as how many internal B Tr | |
35.0000 This means you'll lose trailing spaces in CHAR columns, but not in VAR | |
34.5000 If your table has many rows and CRC32() gives too many collisions, imp | |
34.0000 Your only other options are offline engine specific repair utilities, | |
34.0000 Here's a second query that proves row 1 is locked, even though it didn | |
34.0000 The differences between clustered and non clustered data layouts, and | |
34.0000 The performance improvement from changing NULL columns to NOT NULL is | |
33.5000 The MySQL query optimizer uses two API calls to ask the storage engine | |
33.5000 Random values also cause caches to perform poorly for all types of que | |
====================== num_sentences | |
9.0000 Both floating point and DECIMAL types let you specify the desired prec | |
7.0000 Another reason to create cache tables is for optimizing search and ret | |
7.0000 Some kinds of data don't correspond directly to a built in type. A tim | |
6.0000 We assume the employee's name is unique and can be used as a primary k | |
6.0000 This command gives quite a lot of index information, which the MySQL m | |
6.0000 Here's the trick: even if there's a query that doesn't restrict the re | |
6.0000 A covering index can't be just any kind of index. The index must store | |
6.0000 Storage engines store B Tree indexes differently on disk. This can aff | |
6.0000 Sometimes you can use an ENUM column instead of conventional string ty | |
5.0000 In theory, MySQL could have skipped building a new table. The default | |
5.0000 The most common way to denormalize data is to duplicate, or cache, cer | |
5.0000 The tools for de fragmenting data and indexes are OPTIMIZE TABLE, a no | |
5.0000 Corrupted indexes can cause queries to return wrong results, or even c | |
5.0000 If the last_online restriction appears without the age restriction, or | |
5.0000 Oddly enough, the first thing to decide is whether we have to use inde | |
5.0000 The easiest way to understand indexing concepts is with an illustratio | |
5.0000 The first thing to look at is response time. If your queries are takin | |
5.0000 This example shows how InnoDB can lock rows it doesn't really need, ev | |
5.0000 In most storage engines, an index can cover queries that access only c | |
5.0000 For high concurrency workloads, inserting in primary key order can act | |
5.0000 Also in contrast to MyISAM, secondary indexes are very different from | |
5.0000 * Tables built upon clustered indexes are subject to page splits when | |
5.0000 Unlike some other database servers, you can't choose which index to cl | |
5.0000 You can often save space and get good performance by indexing the firs | |
5.0000 FULLTEXT is a special type of index for MyISAM tables. It finds keywor | |
5.0000 Maatkit ([http://maatkit.sourceforge.net/ http://maatkit.sourceforge.n | |
5.0000 Before MySQL 5.0, BIT is just a synonym for TINYINT. In MySQL 5.0 and | |
5.0000 TIMESTAMP also has special properties DATETIME doesn't have. By defaul | |
5.0000 However, there's another benefit to converting the columns. According | |
5.0000 MySQL supports quite a few string data types, with many variations on | |
5.0000 The DECIMAL type is for storing exact fractional numbers. In MySQL 5.0 | |
====================== syllables_per_word | |
2.8750 * Normalized updates are usually faster than denormalized updates. | |
2.2500 InnoDB supports transactions and four transaction isolation levels. | |
2.1154 Good performance depends on both an optimal schema and optimal indexin | |
2.0500 We've covered most important data type considerations, some of them wi | |
2.0000 The table's data storage can also become fragmented. However, data sto | |
2.0000 If the Extra column of EXPLAIN contains “Using temporary,” the que | |
1.9697 Schemas are often designed from E R (entity relationship) diagrams or | |
1.9592 These limitations make hash indexes useful only in special cases. Howe | |
1.9583 These advantages can boost performance tremendously, if you design you | |
1.9545 * Covering indexes are especially helpful for InnoDB tables because of | |
1.9487 Your choice determines how MySQL stores the data, in memory and on dis | |
1.9412 The differences between clustered and non clustered data layouts, and | |
1.9333 * Pages become sparsely and irregularly filled because of splitting, s | |
1.9286 * Normalized tables are usually smaller, so they fit better in memory | |
1.9286 In addition to the Memory storage engine's explicit hash indexes, the | |
1.9241 This design may appeal to developers, because it lets them work in a c | |
1.9167 * Insert speed depends heavily on inserting in clustered key (primary | |
1.9048 InnoDB stores the same data very differently because of its clustered | |
1.8947 * Secondary (non clustered) indexes are larger because their leaf node | |
1.8919 It's harder for MySQL to optimize queries that refer to nullable colum | |
1.8873 Storage engines store B Tree indexes differently on disk. This can aff | |
1.8864 Unfortunately, it doesn't work for unique indexes, because DISABLE KEY | |
1.8750 Figure 4 TODO shows a simplified B Tree index's structure. | |
1.8667 Normalization generally means the degree to which each fact is represe | |
1.8571 DECIMAL math in MySQL 5.0 and newer uses complex emulation, because CP | |
1.8500 B Tree indexes can become fragmented, which reduces performance. Fragm | |
1.8485 Notice the auto incrementing integer primary key. The second case is a | |
1.8485 MySQL has a few storage types that use individual bits within a value | |
1.8421 We benchmarked two cases. The first is inserting into a userinfo table | |
1.8400 Sometimes you should go beyond the role of a developer, and question t | |
1.8387 The data type's complexity is also important. For example, integer dat | |
====================== kincaid | |
33.7525 Here's an example: if you design a product table with an ENUM field to | |
30.8859 The probability of a hash collision grows much faster than you might t | |
25.1369 You can find a good prefix length by selecting a sample of the most fr | |
24.1021 An inexperienced user might think this identifies the column's role as | |
23.1283 Clustering the data by the primary key can be very beneficial to the a | |
21.8843 A hash index is built on a hash table.<ref name="ftn5">See the compute | |
21.5812 Prefix indexes are a great way to make indexes smaller and faster in m | |
21.4550 * Normalized updates are usually faster than denormalized updates. | |
21.2554 Adding and removing the LIMIT changes the query plan, which lets us tw | |
20.5759 The differences between clustered and non clustered data layouts, and | |
19.6829 Any application that keeps counts in a table can eventually run into c | |
18.7932 Another frequent performance nightmare is Object Relational Mapping (O | |
18.4900 This works well because the MySQL query optimizer notices there's a sm | |
18.2277 The main reason to keep redundant indexes is when extending an existin | |
18.0325 Redundant indexes usually appear when people add indexes to the table. | |
17.9561 Even with the index, the query can be slow if the user interface is pa | |
17.8806 If someone talks about an index without mentioning a type, it's probab | |
17.8616 * Index entries are usually much smaller than the full row size, so My | |
17.7738 The moral of the story is, you should strive to insert data in primary | |
17.7100 Now suppose we add another WHERE criterion that we know will reduce th | |
17.5400 Another way to calculate a good prefix length is by computing selectiv | |
17.5400 If you need quick lookups on multi dimensional data from any combinati | |
17.4524 Here's a second query that proves row 1 is locked, even though it didn | |
17.2771 This means you'll lose trailing spaces in CHAR columns, but not in VAR | |
17.2004 The MySQL query optimizer uses two API calls to ask the storage engine | |
17.1053 The performance improvement from changing NULL columns to NOT NULL is | |
17.0184 For example, if you store the value b'00111001' (which is the binary e | |
16.9467 Be very careful with completely “random” strings, such as those pr | |
16.9400 We've glossed over some of the details, such as how many internal B Tr | |
16.8783 These advantages can boost performance tremendously, if you design you | |
16.5891 There's a way to work around both problems with a combination of cleve | |
====================== syllables per sentence | |
30 There is typically no reason to do so unless you want to have different types of indexes on the same column to satisfy different kinds of queriesref nameftn12An index is not necessarily a duplicate if its a different type of index there are often good reasons to have KEYcol and FULLTEXT KEYnameref | |
29 Heres an example if you design a product table with an ENUM field to define the product type you might want a lookup table primary keyed on an identical ENUM field You could add columns to the lookup table for descriptive text to generate a glossary or to provide meaningful labels in a pull down menu on a website In this case youll want to use the ENUM as an identifier but for most purposes you should avoid doing so | |
26 There is currently no way to de fragment InnoDB indexes as they cant be built by a sort in MySQL 50ref nameftn16The InnoDB developers are working on this problem at the time of writingref Even dropping and re creating them may result in fragmented indexes depending on the data | |
26 We suggest you think carefully before trading performance for developer productivity and always test on a realistically large data set so you dont discover performance problems too late | |
24 They slow INSERT because the inserted value has to go in a random location in indexes which can cause excessive page splits and random disk accesses and causes clustered index fragmentation for clustered storage engines such as InnoDB | |
23 These indexes will satisfy the most frequently specified search queries but how can we design indexes for less common options such as has_pictures eye_color hair_color and education | |
23 It shows average selectivity but its also important to account for worst case selectivity which will be different from the average unless the data is evenly distributed | |
23 When you add indexes try to extend existing indexes instead because it is usually more efficient to maintain one multi column index than several single column indexes | |
22 In most cases you dont want redundant indexes and should extend existing indexes rather than adding new ones but it is not as clear cut as with duplicate indexesthere are times when you need redundant indexes for performance reasons | |
22 The differences between clustered and non clustered data layouts and the corresponding differences between primary and secondary indexes can be confusing and surprising | |
22 This illustrates a general principle when youre designing indexes keep in mind not only the kinds of indexes you need for existing queries but consider the queries to be candidates for optimization too | |
22 This is a technique that makes indexes much faster to build and results in a compact index layout InnoDB currently builds its indexes a row at a time in primary key order which means the index trees arent built in optimal order and are fragmented | |
21 Integer types can optionally have the UNSIGNED attribute which disallows negative values and approximately doubles the upper limit of positive values you can store | |
21 For high concurrency workloads inserting in primary key order can actually create a single point of contention in InnoDB as it is currently implemented | |
21 If MySQL knows itll need to access the row at any point while executing the query MySQL will always read the roweven when a better query plan is theoretically possibleand the index wont cover the queryref nameftn9This may change in future MySQL versions but it applies as of MySQL 50 and 51ref | |
21 Because of the space and computational cost you should use DECIMAL only when you need exact results for fractional numbersfor example when performing financial operations | |
20 If you need quick lookups on multi dimensional data from any combination of dimensions you might consider storing non spatial data into a geospatial type just so you can have spatial indexes | |
20 For example if there is no index on category and theres an index on price a filesort may be better or worse than an index scan depending on the selectivity of the category column | |
20 MyISAM uses prefix compression to reduce index size allowing more of the index to fit in memory and improving performance dramatically in some cases | |
20 These queries often need a particular table and index structure which is different from the one you would use for general OLTP online transaction processing operations | |
20 Example 2 where the second condition filter leaves only a small set of results after index filtering shows how effective the proposed optimization is five times better on our data | |
20 Weve covered most important data type considerations some of them with serious performance implications others with just minor performance effects | |
20 This is true in general adding new indexes may have a dramatic performance impact for INSERT UPDATE and DELETE operations especially if the new index causes you to hit memory limits | |
20 Try to avoid the common mistake of creating indexes without knowing which queries will use them and consider whether all indexes together are an optimal configuration | |
19 This type of fragmentation only affects some operations such as full table scans and clustered index range scans which normally benefit from a sequential data layout on disk | |
19 Refer to computer science literature for a detailed explanation of B Tree indexesref This is the only index type all of MySQLs storage engines support | |
19 We mention some other strategies to optimize counter operations on page TODO application level optimization I think counters in memcache and updating a row with RAND | |
19 The optimizer uses the statistics to estimate the number of rows a query will examine which is important because MySQLs optimizer is cost based and the main cost metric is how much data the query will access | |
18 For example DECIMAL18 9 will store 9 digits from each side of the decimal point using 9 bytes in total 4 for the digits before the decimal point one for the decimal point itself and 4 bytes for the digits after decimal point | |
18 If someone talks about an index without mentioning a type its probably a B Tree index which uses a B Tree data structure to store its dataref nameftn3Its actually a BTree index since each leaf node contains a link to the next for fast range traversals through nodes | |
18 MySQL can perform prefix match LIKE patterns in the index because MySQL can convert them to simple comparisons but the leading wildcard in the query makes it impossible for the storage engine to evaluate the match | |
====================== worst overall | |
7 An inexperienced user might think this identifies the column's role as | |
6 The differences between clustered and non clustered data layouts, and | |
5 This design may appeal to developers, because it lets them work in a c | |
5 * Normalized updates are usually faster than denormalized updates. | |
5 Be very careful with completely “random” strings, such as those pr | |
5 These advantages can boost performance tremendously, if you design you | |
5 Clustering the data by the primary key can be very beneficial to the a | |
4 We assume the employee's name is unique and can be used as a primary k | |
4 The easiest way to understand indexing concepts is with an illustratio | |
4 * Tables built upon clustered indexes are subject to page splits when | |
4 Unfortunately, it doesn't work for unique indexes, because DISABLE KEY | |
4 Another frequent performance nightmare is Object Relational Mapping (O | |
4 Some kinds of data don't correspond directly to a built in type. A tim | |
4 Here's an example: if you design a product table with an ENUM field to | |
4 Another reason to create cache tables is for optimizing search and ret | |
4 Redundant indexes usually appear when people add indexes to the table. | |
4 Prefix indexes are a great way to make indexes smaller and faster in m | |
4 We've covered most important data type considerations, some of them wi | |
4 The tools for de fragmenting data and indexes are OPTIMIZE TABLE, a no | |
4 If the last_online restriction appears without the age restriction, or | |
4 The most common way to denormalize data is to duplicate, or cache, cer | |
4 Good performance depends on both an optimal schema and optimal indexin | |
4 The probability of a hash collision grows much faster than you might t | |
4 Any application that keeps counts in a table can eventually run into c | |
4 You can find a good prefix length by selecting a sample of the most fr | |
4 * Covering indexes are especially helpful for InnoDB tables because of | |
4 Storage engines store B Tree indexes differently on disk. This can aff | |
4 Both floating point and DECIMAL types let you specify the desired prec | |
4 Schemas are often designed from E R (entity relationship) diagrams or | |
4 In most storage engines, an index can cover queries that access only c | |
3 Here's the trick: even if there's a query that doesn't restrict the re |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings FATAL => 'all'; | |
use English qw(-no_match_vars); | |
my $filename = shift or die "Need a filename"; | |
open my $fh, "<", $filename or die $OS_ERROR; | |
(my $fileout = $filename) =~ s/.txt/-clean.txt/; | |
open my $fo, ">", $fileout or die $OS_ERROR; | |
my $wasblank; | |
while ( my $line = <$fh> ) { | |
chomp $line; | |
if ( !$line ) { | |
$wasblank = 1; | |
print $fo "\n" unless $wasblank; | |
next; | |
} | |
next if $line =~ m/nowiki|<center>|'''|\||^ /; | |
next unless $line =~ m/\s/ && $line =~ m/\./; | |
$line =~ s!</?tt>|''!!g; | |
$line =~ s/\n\n+/\n/g; | |
$line =~ s/(\w)-(\w)/$1 $2/g; | |
print $fo $line, "\n\n"; | |
$wasblank = 0; | |
} | |
close $fh; | |
close $fo; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment