|
############################################################################# |
|
## data source definitions |
|
############################################################################# |
|
|
|
source users |
|
{ |
|
# data source type. mandatory, no default value |
|
# known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc |
|
type = mysql |
|
|
|
##################################################################### |
|
## SQL settings (for 'mysql' and 'pgsql' types) |
|
##################################################################### |
|
|
|
# some straightforward parameters for SQL source types |
|
# MySQL connection used by the indexer for source 'users' |
|
# NOTE(review): this logs in as the MySQL 'root' account with a trivial |
|
# password kept in plain text; a dedicated account limited to SELECT on |
|
# my_database would presumably be enough here -- confirm and rotate |
|
sql_host = localhost |
|
sql_user = root |
|
sql_pass = root |
|
sql_db = my_database |
|
#sql_port = 3306 # optional, default is 3306 |
|
|
|
|
|
# MySQL specific client connection flags |
|
# optional, default is 0 |
|
# |
|
# mysql_connect_flags = 32 # enable compression |
|
|
|
|
|
# pre-query, executed before the main fetch query |
|
# multi-value, optional, default is empty list of queries |
|
# |
|
# sql_query_pre = SET NAMES utf8 |
|
# sql_query_pre = SET SESSION query_cache_type=OFF |
|
|
|
|
|
# main document fetch query |
|
# mandatory, integer document ID field MUST be the first selected column |
|
# sql_query = \ |
|
# SELECT id, group_id, UNIX_TIMESTAMP(date_added) AS date_added, title, content \ |
|
# FROM documents |
|
|
|
# fetches one document per row of table 'user' |
|
# first column (source_id) is the document ID, item_id keeps the raw user_id, |
|
# type_id is the constant 0; the three name columns are the full-text fields |
|
# NOTE(review): user_id+10*1 evaluates as user_id+(10*1), i.e. user_id+10, |
|
# so these IDs can coincide with the post_id+10*2 IDs of source 'posts'; |
|
# if a per-source tag was intended (e.g. user_id*10+1), note that changing |
|
# the expression alters every document ID returned to clients -- confirm first |
|
sql_query = \ |
|
SELECT user_id+10*1 as source_id, user_id as item_id, 0 as type_id, user_first_name, user_last_name, user_name \ |
|
FROM user |
|
|
|
|
|
# joined/payload field fetch query |
|
# joined fields let you avoid (slow) JOIN and GROUP_CONCAT |
|
# payload fields let you attach custom per-keyword values (eg. for ranking) |
|
# |
|
# syntax is FIELD-NAME 'from' ( 'query' | 'payload-query' ); QUERY |
|
# joined field QUERY should return 2 columns (docid, text) |
|
# payload field QUERY should return 3 columns (docid, keyword, weight) |
|
# |
|
# REQUIRES that query results are in ascending document ID order! |
|
# multi-value, optional, default is empty list of queries |
|
# |
|
# sql_joined_field = tags from query; SELECT docid, CONCAT('tag',tagid) FROM tags ORDER BY docid ASC |
|
# sql_joined_field = wtags from payload-query; SELECT docid, tag, tagweight FROM tags ORDER BY docid ASC |
|
|
|
|
|
# range query setup, query that must return min and max ID values |
|
# optional, default is empty |
|
# |
|
# sql_query will need to reference $start and $end boundaries |
|
# if using ranged query: |
|
# |
|
# sql_query = \ |
|
# SELECT doc.id, doc.id AS group, doc.title, doc.data \ |
|
# FROM documents doc \ |
|
# WHERE id>=$start AND id<=$end |
|
# |
|
# sql_query_range = SELECT MIN(id),MAX(id) FROM documents |
|
|
|
|
|
# range query step |
|
# optional, default is 1024 |
|
# |
|
# sql_range_step = 1000 |
|
|
|
|
|
# unsigned integer attribute declaration |
|
# multi-value (an arbitrary number of attributes is allowed), optional |
|
# optional bit size can be specified, default is 32 |
|
# |
|
# sql_attr_uint = author_id |
|
# sql_attr_uint = forum_id:9 # 9 bits for forum_id |
|
# both names are column aliases from this source's sql_query: |
|
# item_id is the raw user_id, type_id is the constant 0 |
|
sql_attr_uint = item_id |
|
sql_attr_uint = type_id |
|
|
|
# boolean attribute declaration |
|
# multi-value (an arbitrary number of attributes is allowed), optional |
|
# equivalent to sql_attr_uint with 1-bit size |
|
# |
|
# sql_attr_bool = is_deleted |
|
|
|
|
|
# bigint attribute declaration |
|
# multi-value (an arbitrary number of attributes is allowed), optional |
|
# declares a signed (unlike uint!) 64-bit attribute |
|
# |
|
# sql_attr_bigint = my_bigint_id |
|
|
|
|
|
# UNIX timestamp attribute declaration |
|
# multi-value (an arbitrary number of attributes is allowed), optional |
|
# similar to integer, but can also be used in date functions |
|
# |
|
# sql_attr_timestamp = posted_ts |
|
# sql_attr_timestamp = last_edited_ts |
|
#sql_attr_timestamp = date_added |
|
|
|
# string ordinal attribute declaration |
|
# multi-value (an arbitrary number of attributes is allowed), optional |
|
# sorts strings (bytewise), and stores their indexes in the sorted list |
|
# sorting by this attr is equivalent to sorting by the original strings |
|
# |
|
# sql_attr_str2ordinal = author_name |
|
|
|
|
|
# floating point attribute declaration |
|
# multi-value (an arbitrary number of attributes is allowed), optional |
|
# values are stored in single precision, 32-bit IEEE 754 format |
|
# |
|
# sql_attr_float = lat_radians |
|
# sql_attr_float = long_radians |
|
|
|
|
|
# multi-valued attribute (MVA) attribute declaration |
|
# multi-value (an arbitrary number of attributes is allowed), optional |
|
# MVA values are variable length lists of unsigned 32-bit integers |
|
# |
|
# syntax is ATTR-TYPE ATTR-NAME 'from' SOURCE-TYPE [;QUERY] [;RANGE-QUERY] |
|
# ATTR-TYPE is 'uint' or 'timestamp' |
|
# SOURCE-TYPE is 'field', 'query', or 'ranged-query' |
|
# QUERY is SQL query used to fetch all ( docid, attrvalue ) pairs |
|
# RANGE-QUERY is SQL query used to fetch min and max ID values, similar to 'sql_query_range' |
|
# |
|
# sql_attr_multi = uint tag from query; SELECT docid, tagid FROM tags |
|
# sql_attr_multi = uint tag from ranged-query; \ |
|
# SELECT docid, tagid FROM tags WHERE id>=$start AND id<=$end; \ |
|
# SELECT MIN(docid), MAX(docid) FROM tags |
|
|
|
|
|
# string attribute declaration |
|
# multi-value (an arbitrary number of these is allowed), optional |
|
# lets you store and retrieve strings |
|
# |
|
# sql_attr_string = stitle |
|
|
|
|
|
# wordcount attribute declaration |
|
# multi-value (an arbitrary number of these is allowed), optional |
|
# lets you count the words at indexing time |
|
# |
|
# sql_attr_str2wordcount = stitle |
|
|
|
|
|
# combined field plus attribute declaration (from a single column) |
|
# stores column as an attribute, but also indexes it as a full-text field |
|
# |
|
# sql_field_string = author |
|
# sql_field_str2wordcount = title |
|
|
|
|
|
# post-query, executed on sql_query completion |
|
# optional, default is empty |
|
# |
|
# sql_query_post = |
|
|
|
|
|
# post-index-query, executed on successful indexing completion |
|
# optional, default is empty |
|
# $maxid expands to max document ID actually fetched from DB |
|
# |
|
# sql_query_post_index = REPLACE INTO counters ( id, val ) \ |
|
# VALUES ( 'max_indexed_id', $maxid ) |
|
|
|
|
|
# ranged query throttling, in milliseconds |
|
# optional, default is 0 which means no delay |
|
# enforces given delay before each query step |
|
sql_ranged_throttle = 0 |
|
|
|
# document info query, ONLY for CLI search (ie. testing and debugging) |
|
# optional, default is empty |
|
# must contain $id macro and must fetch the document by that id |
|
# sql_query_info = SELECT * FROM documents WHERE id=$id |
|
|
|
# kill-list query, fetches the document IDs for kill-list |
|
# k-list will suppress matches from preceding indexes in the same query |
|
# optional, default is empty |
|
# |
|
# sql_query_killlist = SELECT id FROM documents WHERE edited>=@last_reindex |
|
|
|
|
|
# columns to unpack on indexer side when indexing |
|
# multi-value, optional, default is empty list |
|
# |
|
# unpack_zlib = zlib_column |
|
# unpack_mysqlcompress = compressed_column |
|
# unpack_mysqlcompress = compressed_column_2 |
|
|
|
|
|
# maximum unpacked length allowed in MySQL COMPRESS() unpacker |
|
# optional, default is 16M |
|
# |
|
# unpack_mysqlcompress_maxsize = 16M |
|
} |
|
|
|
source posts |
|
{ |
|
# data source type. mandatory, no default value |
|
# known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc |
|
type = mysql |
|
|
|
##################################################################### |
|
## SQL settings (for 'mysql' and 'pgsql' types) |
|
##################################################################### |
|
|
|
# some straightforward parameters for SQL source types |
|
# MySQL connection used by the indexer for source 'posts' |
|
# NOTE(review): this logs in as the MySQL 'root' account with a trivial |
|
# password kept in plain text; a dedicated account limited to SELECT on |
|
# my_database would presumably be enough here -- confirm and rotate |
|
sql_host = localhost |
|
sql_user = root |
|
sql_pass = root |
|
sql_db = my_database |
|
#sql_port = 3306 # optional, default is 3306 |
|
|
|
|
|
# main document fetch query |
|
# mandatory, integer document ID field MUST be the first selected column |
|
# sql_query = \ |
|
# SELECT id, group_id, UNIX_TIMESTAMP(date_added) AS date_added, title, content \ |
|
# FROM documents |
|
|
|
# fetches one document per row of cl_posts where post_deleted IS NULL |
|
# first column (source_id) is the document ID, item_id keeps the raw post_id, |
|
# type_id is post_type_id, post_date is post_timestamp as a UNIX timestamp; |
|
# post_title and post_content are the full-text fields |
|
# NOTE(review): post_id+10*2 evaluates as post_id+(10*2), i.e. post_id+20, |
|
# so these IDs can coincide with the user_id+10*1 IDs of source 'users'; |
|
# if a per-source tag was intended (e.g. post_id*10+2), note that changing |
|
# the expression alters every document ID returned to clients, and the docid |
|
# column of sql_attr_multi below has to follow -- confirm first |
|
sql_query = \ |
|
SELECT post_id+10*2 as source_id, post_id as item_id, post_type_id as type_id, post_title, post_content, UNIX_TIMESTAMP(post_timestamp) as post_date \ |
|
FROM cl_posts \ |
|
WHERE post_deleted IS NULL |
|
|
|
|
|
# joined/payload field fetch query |
|
# joined fields let you avoid (slow) JOIN and GROUP_CONCAT |
|
# payload fields let you attach custom per-keyword values (eg. for ranking) |
|
# |
|
# syntax is FIELD-NAME 'from' ( 'query' | 'payload-query' ); QUERY |
|
# joined field QUERY should return 2 columns (docid, text) |
|
# payload field QUERY should return 3 columns (docid, keyword, weight) |
|
# |
|
# REQUIRES that query results are in ascending document ID order! |
|
# multi-value, optional, default is empty list of queries |
|
# |
|
# sql_joined_field = tags from query; SELECT docid, CONCAT('tag',tagid) FROM tags ORDER BY docid ASC |
|
# sql_joined_field = wtags from payload-query; SELECT docid, tag, tagweight FROM tags ORDER BY docid ASC |
|
|
|
|
|
|
|
# unsigned integer attribute declaration |
|
# multi-value (an arbitrary number of attributes is allowed), optional |
|
# optional bit size can be specified, default is 32 |
|
# |
|
# sql_attr_uint = author_id |
|
# sql_attr_uint = forum_id:9 # 9 bits for forum_id |
|
# both names are column aliases from this source's sql_query: |
|
# item_id is the raw post_id, type_id is post_type_id |
|
sql_attr_uint = item_id |
|
sql_attr_uint = type_id |
|
|
|
# boolean attribute declaration |
|
# multi-value (an arbitrary number of attributes is allowed), optional |
|
# equivalent to sql_attr_uint with 1-bit size |
|
# |
|
# sql_attr_bool = is_deleted |
|
|
|
|
|
# bigint attribute declaration |
|
# multi-value (an arbitrary number of attributes is allowed), optional |
|
# declares a signed (unlike uint!) 64-bit attribute |
|
# |
|
# sql_attr_bigint = my_bigint_id |
|
|
|
|
|
# UNIX timestamp attribute declaration |
|
# multi-value (an arbitrary number of attributes is allowed), optional |
|
# similar to integer, but can also be used in date functions |
|
# |
|
# sql_attr_timestamp = posted_ts |
|
# sql_attr_timestamp = last_edited_ts |
|
sql_attr_timestamp = post_date |
|
|
|
# string ordinal attribute declaration |
|
# multi-value (an arbitrary number of attributes is allowed), optional |
|
# sorts strings (bytewise), and stores their indexes in the sorted list |
|
# sorting by this attr is equivalent to sorting by the original strings |
|
# |
|
# sql_attr_str2ordinal = author_name |
|
|
|
|
|
# floating point attribute declaration |
|
# multi-value (an arbitrary number of attributes is allowed), optional |
|
# values are stored in single precision, 32-bit IEEE 754 format |
|
# |
|
# sql_attr_float = lat_radians |
|
# sql_attr_float = long_radians |
|
|
|
|
|
# multi-valued attribute (MVA) attribute declaration |
|
# multi-value (an arbitrary number of attributes is allowed), optional |
|
# MVA values are variable length lists of unsigned 32-bit integers |
|
# |
|
# syntax is ATTR-TYPE ATTR-NAME 'from' SOURCE-TYPE [;QUERY] [;RANGE-QUERY] |
|
# ATTR-TYPE is 'uint' or 'timestamp' |
|
# SOURCE-TYPE is 'field', 'query', or 'ranged-query' |
|
# QUERY is SQL query used to fetch all ( docid, attrvalue ) pairs |
|
# RANGE-QUERY is SQL query used to fetch min and max ID values, similar to 'sql_query_range' |
|
# |
|
# sql_attr_multi = uint tag from query; SELECT docid, tagid FROM tags |
|
# sql_attr_multi = uint tag from ranged-query; \ |
|
# SELECT docid, tagid FROM tags WHERE id>=$start AND id<=$end; \ |
|
# SELECT MIN(docid), MAX(docid) FROM tags |
|
# 'categories' MVA: every category_id linked to a post |
|
# the docid column must carry the same value as the first column of this |
|
# source's sql_query (post_id+10*2); returning the raw post_id would attach |
|
# the categories to a different document, or to none at all |
|
# presumably cl_post_categories.post_id refers to cl_posts.post_id -- verify |
|
sql_attr_multi = uint categories from query; SELECT post_id+10*2, category_id FROM cl_post_categories |
|
|
|
|
|
# string attribute declaration |
|
# multi-value (an arbitrary number of these is allowed), optional |
|
# lets you store and retrieve strings |
|
# |
|
# sql_attr_string = stitle |
|
|
|
|
|
# wordcount attribute declaration |
|
# multi-value (an arbitrary number of these is allowed), optional |
|
# lets you count the words at indexing time |
|
# |
|
# sql_attr_str2wordcount = stitle |
|
|
|
|
|
# combined field plus attribute declaration (from a single column) |
|
# stores column as an attribute, but also indexes it as a full-text field |
|
# |
|
# sql_field_string = author |
|
# sql_field_str2wordcount = title |
|
|
|
|
|
# post-query, executed on sql_query completion |
|
# optional, default is empty |
|
# |
|
# sql_query_post = |
|
|
|
|
|
# post-index-query, executed on successful indexing completion |
|
# optional, default is empty |
|
# $maxid expands to max document ID actually fetched from DB |
|
# |
|
# sql_query_post_index = REPLACE INTO counters ( id, val ) \ |
|
# VALUES ( 'max_indexed_id', $maxid ) |
|
|
|
|
|
# ranged query throttling, in milliseconds |
|
# optional, default is 0 which means no delay |
|
# enforces given delay before each query step |
|
sql_ranged_throttle = 0 |
|
} |
|
|
|
|
|
############################################################################# |
|
## index definitions |
|
############################################################################# |
|
|
|
# local index example |
|
# |
|
# this is an index which is stored locally in the filesystem |
|
# |
|
# all indexing-time options (such as morphology and charsets) |
|
# are configured per local index |
|
index userindex |
|
{ |
|
# index type |
|
# optional, default is 'plain' |
|
# known values are 'plain', 'distributed', and 'rt' (see samples below) |
|
# type = plain |
|
|
|
# document source(s) to index |
|
# multi-value, mandatory |
|
# document IDs must be globally unique across all sources |
|
source = users |
|
|
|
# index files path and file name, without extension |
|
# mandatory, path must be writable, extensions will be auto-appended |
|
path = /var/lib/sphinxsearch/data/userindex |
|
|
|
# document attribute values (docinfo) storage mode |
|
# optional, default is 'extern' |
|
# known values are 'none', 'extern' and 'inline' |
|
docinfo = extern |
|
|
|
# memory locking for cached data (.spa and .spi), to prevent swapping |
|
# optional, default is 0 (do not mlock) |
|
# requires searchd to be run from root |
|
mlock = 0 |
|
|
|
# a list of morphology preprocessors to apply |
|
# optional, default is empty |
|
# |
|
# builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru', |
|
# 'soundex', and 'metaphone'; additional preprocessors available from |
|
# libstemmer are 'libstemmer_XXX', where XXX is algorithm code |
|
# (see libstemmer_c/libstemmer/modules.txt) |
|
# |
|
# morphology = stem_en, stem_ru, soundex |
|
# morphology = libstemmer_german |
|
# morphology = libstemmer_sv |
|
morphology = none |
|
|
|
# minimum word length at which to enable stemming |
|
# optional, default is 1 (stem everything) |
|
# |
|
# min_stemming_len = 1 |
|
|
|
|
|
# stopword files list (space separated) |
|
# optional, default is empty |
|
# contents are plain text, charset_table and stemming are both applied |
|
# |
|
# stopwords = /var/lib/sphinxsearch/data/stopwords.txt |
|
|
|
|
|
# wordforms file, in "mapfrom > mapto" plain text format |
|
# optional, default is empty |
|
# |
|
# wordforms = /var/lib/sphinxsearch/data/wordforms.txt |
|
|
|
|
|
# tokenizing exceptions file |
|
# optional, default is empty |
|
# |
|
# plain text, case sensitive, space insensitive in map-from part |
|
# one "Map Several Words => ToASingleOne" entry per line |
|
# |
|
# exceptions = /var/lib/sphinxsearch/data/exceptions.txt |
|
|
|
|
|
# minimum indexed word length |
|
# default is 1 (index everything) |
|
min_word_len = 3 |
|
|
|
# charset encoding type |
|
# optional, default is 'sbcs' |
|
# known types are 'sbcs' (Single Byte CharSet) and 'utf-8' |
|
charset_type = sbcs |
|
|
|
# charset definition and case folding rules "table" |
|
# optional, default value depends on charset_type |
|
# |
|
# defaults are configured to include English and Russian characters only |
|
# you need to change the table to include additional ones |
|
# this behavior MAY change in future versions |
|
# |
|
# 'sbcs' default value is |
|
# charset_table = 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF |
|
# |
|
# 'utf-8' default value is |
|
# charset_table = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F |
|
|
|
|
|
# ignored characters list |
|
# optional, default value is empty |
|
# |
|
# ignore_chars = U+00AD |
|
|
|
|
|
# minimum word prefix length to index |
|
# optional, default is 0 (do not index prefixes) |
|
# |
|
# min_prefix_len = 3 |
|
|
|
|
|
# minimum word infix length to index |
|
# optional, default is 0 (do not index infixes) |
|
# |
|
min_infix_len = 3 |
|
|
|
|
|
# list of fields to limit prefix/infix indexing to |
|
# optional, default value is empty (index all fields in prefix/infix mode) |
|
# |
|
# prefix_fields = filename |
|
# infix_fields = url, domain |
|
|
|
|
|
# enable star-syntax (wildcards) when searching prefix/infix indexes |
|
# search-time only, does not affect indexing, can be 0 or 1 |
|
# optional, default is 0 (do not use wildcard syntax) |
|
# |
|
enable_star = 1 |
|
|
|
|
|
# expand keywords with exact forms and/or stars when searching fit indexes |
|
# search-time only, does not affect indexing, can be 0 or 1 |
|
# optional, default is 0 (do not expand keywords) |
|
# |
|
# expand_keywords = 1 |
|
|
|
|
|
# n-gram length to index, for CJK indexing |
|
# only supports 0 and 1 for now, other lengths to be implemented |
|
# optional, default is 0 (disable n-grams) |
|
# |
|
# ngram_len = 1 |
|
|
|
|
|
# n-gram characters list, for CJK indexing |
|
# optional, default is empty |
|
# |
|
# ngram_chars = U+3000..U+2FA1F |
|
|
|
|
|
# phrase boundary characters list |
|
# optional, default is empty |
|
# |
|
# phrase_boundary = ., ?, !, U+2026 # horizontal ellipsis |
|
|
|
|
|
# phrase boundary word position increment |
|
# optional, default is 0 |
|
# |
|
# phrase_boundary_step = 100 |
|
|
|
|
|
# blended characters list |
|
# blended chars are indexed both as separators and valid characters |
|
# for instance, AT&T will result in 3 tokens ("at", "t", and "at&t") |
|
# optional, default is empty |
|
# |
|
# blend_chars = +, &, U+23 |
|
|
|
|
|
# blended token indexing mode |
|
# a comma separated list of blended token indexing variants |
|
# known variants are trim_none, trim_head, trim_tail, trim_both, skip_pure |
|
# optional, default is trim_none |
|
# |
|
# blend_mode = trim_tail, skip_pure |
|
|
|
|
|
# whether to strip HTML tags from incoming documents |
|
# known values are 0 (do not strip) and 1 (do strip) |
|
# optional, default is 0 |
|
html_strip = 0 |
|
|
|
# what HTML attributes to index if stripping HTML |
|
# optional, default is empty (do not index anything) |
|
# |
|
# html_index_attrs = img=alt,title; a=title; |
|
|
|
|
|
# what HTML elements contents to strip |
|
# optional, default is empty (do not strip element contents) |
|
# |
|
# html_remove_elements = style, script |
|
|
|
|
|
# whether to preopen index data files on startup |
|
# optional, default is 0 (do not preopen), searchd-only |
|
# |
|
# preopen = 1 |
|
|
|
|
|
# whether to keep dictionary (.spi) on disk, or cache it in RAM |
|
# optional, default is 0 (cache in RAM), searchd-only |
|
# |
|
# ondisk_dict = 1 |
|
|
|
|
|
# whether to enable in-place inversion (2x less disk, 90-95% speed) |
|
# optional, default is 0 (use separate temporary files), indexer-only |
|
# |
|
# inplace_enable = 1 |
|
|
|
|
|
# in-place fine-tuning options |
|
# optional, defaults are listed below |
|
# |
|
# inplace_hit_gap = 0 # preallocated hitlist gap size |
|
# inplace_docinfo_gap = 0 # preallocated docinfo gap size |
|
# inplace_reloc_factor = 0.1 # relocation buffer size within arena |
|
# inplace_write_factor = 0.1 # write buffer size within arena |
|
|
|
|
|
# whether to index original keywords along with stemmed versions |
|
# enables "=exactform" operator to work |
|
# optional, default is 0 |
|
# |
|
# index_exact_words = 1 |
|
|
|
|
|
# position increment on overshort (less than min_word_len) words |
|
# optional, allowed values are 0 and 1, default is 1 |
|
# |
|
# overshort_step = 1 |
|
|
|
|
|
# position increment on stopword |
|
# optional, allowed values are 0 and 1, default is 1 |
|
# |
|
# stopword_step = 1 |
|
|
|
|
|
# hitless words list |
|
# positions for these keywords will not be stored in the index |
|
# optional, allowed values are 'all', or a list file name |
|
# |
|
# hitless_words = all |
|
# hitless_words = hitless.txt |
|
|
|
|
|
# detect and index sentence and paragraph boundaries |
|
# required for the SENTENCE and PARAGRAPH operators to work |
|
# optional, allowed values are 0 and 1, default is 0 |
|
# |
|
# index_sp = 1 |
|
|
|
|
|
# index zones, delimited by HTML/XML tags |
|
# a comma separated list of tags and wildcards |
|
# required for the ZONE operator to work |
|
# optional, default is empty string (do not index zones) |
|
# |
|
# index_zones = title, h*, th |
|
} |
|
|
|
index postindex |
|
{ |
|
# index type |
|
# optional, default is 'plain' |
|
# known values are 'plain', 'distributed', and 'rt' (see samples below) |
|
# type = plain |
|
|
|
# document source(s) to index |
|
# multi-value, mandatory |
|
# document IDs must be globally unique across all sources |
|
source = posts |
|
|
|
# index files path and file name, without extension |
|
# mandatory, path must be writable, extensions will be auto-appended |
|
path = /var/lib/sphinxsearch/data/postindex |
|
|
|
# document attribute values (docinfo) storage mode |
|
# optional, default is 'extern' |
|
# known values are 'none', 'extern' and 'inline' |
|
docinfo = extern |
|
|
|
# memory locking for cached data (.spa and .spi), to prevent swapping |
|
# optional, default is 0 (do not mlock) |
|
# requires searchd to be run from root |
|
mlock = 0 |
|
|
|
# a list of morphology preprocessors to apply |
|
# optional, default is empty |
|
# |
|
# builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru', |
|
# 'soundex', and 'metaphone'; additional preprocessors available from |
|
# libstemmer are 'libstemmer_XXX', where XXX is algorithm code |
|
# (see libstemmer_c/libstemmer/modules.txt) |
|
# |
|
# morphology = stem_en, stem_ru, soundex |
|
# morphology = libstemmer_german |
|
# morphology = libstemmer_sv |
|
morphology = none |
|
|
|
# minimum word length at which to enable stemming |
|
# optional, default is 1 (stem everything) |
|
# |
|
# min_stemming_len = 1 |
|
|
|
|
|
# stopword files list (space separated) |
|
# optional, default is empty |
|
# contents are plain text, charset_table and stemming are both applied |
|
# |
|
# stopwords = /var/lib/sphinxsearch/data/stopwords.txt |
|
|
|
|
|
# wordforms file, in "mapfrom > mapto" plain text format |
|
# optional, default is empty |
|
# |
|
# wordforms = /var/lib/sphinxsearch/data/wordforms.txt |
|
|
|
|
|
# tokenizing exceptions file |
|
# optional, default is empty |
|
# |
|
# plain text, case sensitive, space insensitive in map-from part |
|
# one "Map Several Words => ToASingleOne" entry per line |
|
# |
|
# exceptions = /var/lib/sphinxsearch/data/exceptions.txt |
|
|
|
|
|
# minimum indexed word length |
|
# default is 1 (index everything) |
|
min_word_len = 3 |
|
|
|
# charset encoding type |
|
# optional, default is 'sbcs' |
|
# known types are 'sbcs' (Single Byte CharSet) and 'utf-8' |
|
charset_type = sbcs |
|
|
|
# charset definition and case folding rules "table" |
|
# optional, default value depends on charset_type |
|
# |
|
# defaults are configured to include English and Russian characters only |
|
# you need to change the table to include additional ones |
|
# this behavior MAY change in future versions |
|
# |
|
# 'sbcs' default value is |
|
# charset_table = 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF |
|
# |
|
# 'utf-8' default value is |
|
# charset_table = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F |
|
|
|
|
|
# ignored characters list |
|
# optional, default value is empty |
|
# |
|
# ignore_chars = U+00AD |
|
|
|
|
|
# minimum word prefix length to index |
|
# optional, default is 0 (do not index prefixes) |
|
# |
|
# min_prefix_len = 3 |
|
|
|
|
|
# minimum word infix length to index |
|
# optional, default is 0 (do not index infixes) |
|
# |
|
# min_infix_len = 3 |
|
|
|
|
|
# list of fields to limit prefix/infix indexing to |
|
# optional, default value is empty (index all fields in prefix/infix mode) |
|
# |
|
# prefix_fields = filename |
|
# infix_fields = url, domain |
|
|
|
|
|
# enable star-syntax (wildcards) when searching prefix/infix indexes |
|
# search-time only, does not affect indexing, can be 0 or 1 |
|
# optional, default is 0 (do not use wildcard syntax) |
|
# |
|
# enable_star = 1 |
|
|
|
|
|
# expand keywords with exact forms and/or stars when searching fit indexes |
|
# search-time only, does not affect indexing, can be 0 or 1 |
|
# optional, default is 0 (do not expand keywords) |
|
# |
|
# expand_keywords = 1 |
|
|
|
|
|
# n-gram length to index, for CJK indexing |
|
# only supports 0 and 1 for now, other lengths to be implemented |
|
# optional, default is 0 (disable n-grams) |
|
# |
|
# ngram_len = 1 |
|
|
|
|
|
# n-gram characters list, for CJK indexing |
|
# optional, default is empty |
|
# |
|
# ngram_chars = U+3000..U+2FA1F |
|
|
|
|
|
# phrase boundary characters list |
|
# optional, default is empty |
|
# |
|
# phrase_boundary = ., ?, !, U+2026 # horizontal ellipsis |
|
|
|
|
|
# phrase boundary word position increment |
|
# optional, default is 0 |
|
# |
|
# phrase_boundary_step = 100 |
|
|
|
|
|
# blended characters list |
|
# blended chars are indexed both as separators and valid characters |
|
# for instance, AT&T will result in 3 tokens ("at", "t", and "at&t") |
|
# optional, default is empty |
|
# |
|
# blend_chars = +, &, U+23 |
|
|
|
|
|
# blended token indexing mode |
|
# a comma separated list of blended token indexing variants |
|
# known variants are trim_none, trim_head, trim_tail, trim_both, skip_pure |
|
# optional, default is trim_none |
|
# |
|
# blend_mode = trim_tail, skip_pure |
|
|
|
|
|
# whether to strip HTML tags from incoming documents |
|
# known values are 0 (do not strip) and 1 (do strip) |
|
# optional, default is 0 |
|
html_strip = 0 |
|
|
|
# what HTML attributes to index if stripping HTML |
|
# optional, default is empty (do not index anything) |
|
# |
|
# html_index_attrs = img=alt,title; a=title; |
|
|
|
|
|
# what HTML elements contents to strip |
|
# optional, default is empty (do not strip element contents) |
|
# |
|
# html_remove_elements = style, script |
|
|
|
|
|
# whether to preopen index data files on startup |
|
# optional, default is 0 (do not preopen), searchd-only |
|
# |
|
# preopen = 1 |
|
|
|
|
|
# whether to keep dictionary (.spi) on disk, or cache it in RAM |
|
# optional, default is 0 (cache in RAM), searchd-only |
|
# |
|
# ondisk_dict = 1 |
|
|
|
|
|
# whether to enable in-place inversion (2x less disk, 90-95% speed) |
|
# optional, default is 0 (use separate temporary files), indexer-only |
|
# |
|
# inplace_enable = 1 |
|
|
|
|
|
# in-place fine-tuning options |
|
# optional, defaults are listed below |
|
# |
|
# inplace_hit_gap = 0 # preallocated hitlist gap size |
|
# inplace_docinfo_gap = 0 # preallocated docinfo gap size |
|
# inplace_reloc_factor = 0.1 # relocation buffer size within arena |
|
# inplace_write_factor = 0.1 # write buffer size within arena |
|
|
|
|
|
# whether to index original keywords along with stemmed versions |
|
# enables "=exactform" operator to work |
|
# optional, default is 0 |
|
# |
|
# index_exact_words = 1 |
|
|
|
|
|
# position increment on overshort (less than min_word_len) words |
|
# optional, allowed values are 0 and 1, default is 1 |
|
# |
|
# overshort_step = 1 |
|
|
|
|
|
# position increment on stopword |
|
# optional, allowed values are 0 and 1, default is 1 |
|
# |
|
# stopword_step = 1 |
|
|
|
|
|
# hitless words list |
|
# positions for these keywords will not be stored in the index |
|
# optional, allowed values are 'all', or a list file name |
|
# |
|
# hitless_words = all |
|
# hitless_words = hitless.txt |
|
|
|
|
|
# detect and index sentence and paragraph boundaries |
|
# required for the SENTENCE and PARAGRAPH operators to work |
|
# optional, allowed values are 0 and 1, default is 0 |
|
# |
|
# index_sp = 1 |
|
|
|
|
|
# index zones, delimited by HTML/XML tags |
|
# a comma separated list of tags and wildcards |
|
# required for the ZONE operator to work |
|
# optional, default is empty string (do not index zones) |
|
# |
|
# index_zones = title, h*, th |
|
} |
|
|
|
############################################################################# |
|
## indexer settings |
|
############################################################################# |
|
|
|
indexer |
|
{ |
|
# memory limit, in bytes, kilobytes (16384K) or megabytes (256M) |
|
# optional, default is 32M, max is 2047M, recommended is 256M to 1024M |
|
mem_limit = 256M |
|
|
|
# maximum IO calls per second (for I/O throttling) |
|
# optional, default is 0 (unlimited) |
|
# |
|
# max_iops = 40 |
|
|
|
|
|
# maximum IO call size, bytes (for I/O throttling) |
|
# optional, default is 0 (unlimited) |
|
# |
|
# max_iosize = 1048576 |
|
|
|
|
|
# maximum xmlpipe2 field length, bytes |
|
# optional, default is 2M |
|
# |
|
# max_xmlpipe2_field = 4M |
|
|
|
|
|
# write buffer size, bytes |
|
# several (currently up to 4) buffers will be allocated |
|
# write buffers are allocated in addition to mem_limit |
|
# optional, default is 1M |
|
# |
|
# write_buffer = 1M |
|
|
|
|
|
# maximum file field adaptive buffer size |
|
# optional, default is 8M, minimum is 1M |
|
# |
|
# max_file_field_buffer = 32M |
|
} |
|
|
|
############################################################################# |
|
## searchd settings |
|
############################################################################# |
|
|
|
searchd |
|
{ |
|
# [hostname:]port[:protocol], or /unix/socket/path to listen on |
|
# known protocols are 'sphinx' (SphinxAPI) and 'mysql41' (SphinxQL) |
|
# |
|
# multi-value, multiple listen points are allowed |
|
# optional, defaults are 9312:sphinx and 9306:mysql41, as below |
|
# |
|
# listen = 127.0.0.1 |
|
# listen = 192.168.0.1:9312 |
|
# listen = 9312 |
|
# listen = /var/run/searchd.sock |
|
listen = 9312 |
|
listen = 9306:mysql41 |
|
|
|
# log file, searchd run info is logged here |
|
# optional, default is 'searchd.log' |
|
log = /var/log/sphinxsearch/searchd.log |
|
|
|
# query log file, all search queries are logged here |
|
# optional, default is empty (do not log queries) |
|
query_log = /var/log/sphinxsearch/query.log |
|
|
|
# client read timeout, seconds |
|
# optional, default is 5 |
|
read_timeout = 5 |
|
|
|
# request timeout, seconds |
|
# optional, default is 5 minutes |
|
client_timeout = 300 |
|
|
|
# maximum amount of children to fork (concurrent searches to run) |
|
# optional, default is 0 (unlimited) |
|
max_children = 70 |
|
|
|
# PID file, searchd process ID file name |
|
# mandatory |
|
pid_file = /var/run/sphinxsearch/searchd.pid |
|
|
|
# max amount of matches the daemon ever keeps in RAM, per-index |
|
# WARNING, THERE'S ALSO PER-QUERY LIMIT, SEE SetLimits() API CALL |
|
# default is 1000 (just like Google) |
|
max_matches = 1000 |
|
|
|
# seamless rotate, prevents rotate stalls if precaching huge datasets |
|
# optional, default is 1 |
|
seamless_rotate = 1 |
|
|
|
# whether to forcibly preopen all indexes on startup |
|
# optional, default is 1 (preopen everything) |
|
preopen_indexes = 1 |
|
|
|
# whether to unlink .old index copies on successful rotation. |
|
# optional, default is 1 (do unlink) |
|
unlink_old = 1 |
|
|
|
# attribute updates periodic flush timeout, seconds |
|
# updates will be automatically dumped to disk this frequently |
|
# optional, default is 0 (disable periodic flush) |
|
# |
|
# attr_flush_period = 900 |
|
|
|
|
|
# instance-wide ondisk_dict defaults (per-index value take precedence) |
|
# optional, default is 0 (precache all dictionaries in RAM) |
|
# |
|
# ondisk_dict_default = 1 |
|
|
|
|
|
# MVA updates pool size |
|
# shared between all instances of searchd, disables attr flushes! |
|
# optional, default size is 1M |
|
mva_updates_pool = 5M |
|
|
|
# max allowed network packet size |
|
# limits both query packets from clients, and responses from agents |
|
# optional, default size is 8M |
|
max_packet_size = 8M |
|
|
|
# crash log path |
|
# searchd will (try to) log crashed query to 'crash_log_path.PID' file |
|
# optional, default is empty (do not create crash logs) |
|
# |
|
# crash_log_path = /var/log/sphinxsearch/crash |
|
|
|
|
|
# max allowed per-query filter count |
|
# optional, default is 256 |
|
max_filters = 256 |
|
|
|
# max allowed per-filter values count |
|
# optional, default is 4096 |
|
max_filter_values = 4096 |
|
|
|
|
|
# socket listen queue length |
|
# optional, default is 5 |
|
# |
|
# listen_backlog = 5 |
|
|
|
|
|
# per-keyword read buffer size |
|
# optional, default is 256K |
|
# |
|
# read_buffer = 256K |
|
|
|
|
|
# unhinted read size (currently used when reading hits) |
|
# optional, default is 32K |
|
# |
|
# read_unhinted = 32K |
|
|
|
|
|
# max allowed per-batch query count (aka multi-query count) |
|
# optional, default is 32 |
|
max_batch_queries = 32 |
|
|
|
|
|
# max common subtree document cache size, per-query |
|
# optional, default is 0 (disable subtree optimization) |
|
# |
|
# subtree_docs_cache = 4M |
|
|
|
|
|
# max common subtree hit cache size, per-query |
|
# optional, default is 0 (disable subtree optimization) |
|
# |
|
# subtree_hits_cache = 8M |
|
|
|
|
|
# multi-processing mode (MPM) |
|
# known values are none, fork, prefork, and threads |
|
# optional, default is fork |
|
# |
|
workers = threads # for RT to work |
|
|
|
|
|
# max threads to create for searching local parts of a distributed index |
|
# optional, default is 0, which means disable multi-threaded searching |
|
# should work with all MPMs (ie. does NOT require workers=threads) |
|
# |
|
# dist_threads = 4 |
|
|
|
|
|
# binlog files path; use empty string to disable binlog |
|
# optional, default is build-time configured data directory |
|
# |
|
# binlog_path = # disable logging |
|
# binlog_path = /var/lib/sphinxsearch/data # binlog.001 etc will be created there |
|
|
|
|
|
# binlog flush/sync mode |
|
# 0 means flush and sync every second |
|
# 1 means flush and sync every transaction |
|
# 2 means flush every transaction, sync every second |
|
# optional, default is 2 |
|
# |
|
# binlog_flush = 2 |
|
|
|
|
|
# binlog per-file size limit |
|
# optional, default is 128M, 0 means no limit |
|
# |
|
# binlog_max_log_size = 256M |
|
|
|
|
|
# per-thread stack size, only affects workers=threads mode |
|
# optional, default is 64K |
|
# |
|
# thread_stack = 128K |
|
|
|
|
|
# per-keyword expansion limit (for dict=keywords prefix searches) |
|
# optional, default is 0 (no limit) |
|
# |
|
# expansion_limit = 1000 |
|
|
|
|
|
# RT RAM chunks flush period |
|
# optional, default is 0 (no periodic flush) |
|
# |
|
# rt_flush_period = 900 |
|
|
|
|
|
# query log file format |
|
# optional, known values are plain and sphinxql, default is plain |
|
# |
|
# query_log_format = sphinxql |
|
|
|
|
|
# version string returned to MySQL network protocol clients |
|
# optional, default is empty (use Sphinx version) |
|
# |
|
# mysql_version_string = 5.0.37 |
|
|
|
|
|
# trusted plugin directory |
|
# optional, default is empty (disable UDFs) |
|
# |
|
# plugin_dir = /usr/local/sphinx/lib |
|
|
|
|
|
# default server-wide collation |
|
# optional, default is libc_ci |
|
# |
|
# collation_server = utf8_general_ci |
|
|
|
|
|
# server-wide locale for libc based collations |
|
# optional, default is C |
|
# |
|
# collation_libc_locale = ru_RU.UTF-8 |
|
|
|
|
|
# threaded server watchdog (only used in workers=threads mode) |
|
# optional, values are 0 and 1, default is 1 (watchdog on) |
|
# |
|
# watchdog = 1 |
|
|
|
|
|
# SphinxQL compatibility mode (legacy columns and their names) |
|
# optional, default is 0 (SQL compliant syntax and result sets) |
|
# |
|
# compat_sphinxql_magics = 1 |
|
} |