Link Grammar for Russian (Parser of the Parser)
link_parser/lexer.rb
# encoding: utf-8

require 'strscan'

# Processor of Link Grammar for Russian output.
#
class LinkParser::Lexer
  # This exception is raised when the link grammar is invalid and the Lexer
  # is unable to understand the output.
  #
  class InvalidLinkGrammar < RuntimeError
    attr_reader :input

    # @private
    def initialize input
      super 'Invalid link grammar'
      @input = input
    end
  end

  # Abstract syntax tree of the parser output.
  #
  AST = Struct.new(:value)

  # A structure that represents a link in Link Grammar.
  # Includes type and position definitions along with the word and its
  # morphosyntactic descriptors.
  #
  Link = Struct.new(:type, :subtype, :id, :word, :msd)

  # A structure that represents a word in Link Grammar. Includes
  # morphosyntactic descriptors.
  #
  Word = Struct.new(:word, :msd)

  attr_reader :input, :lexer
  private :input, :lexer

  # Create a new {Lexer} instance to process the given parser output.
  #
  # @param input [String] output of the parser.
  #
  def initialize input
    @input = input
  end

  # Perform parsing of the parser output. This wording is silly, but
  # I really can't implement a good Link Parser right now.
  #
  # @return [AST] the AST of the given parser output.
  #
  def parse
    @lexer = StringScanner.new(input)
    parse_value.value
  ensure
    lexer.eos? or
      raise('Unexpected data: "%s"' % lexer.string[lexer.pos..-1])
  end

  protected

  # Parse any supported syntactic construction of our parser.
  #
  # @return [AST] the AST of the given parser output.
  #
  def parse_value
    trim_space!
    parse_list or
      parse_string or
      parse_link or
      raise InvalidLinkGrammar, input
  ensure
    trim_space!
  end

  # List parser.
  #
  # @return [AST] the AST of the given parser output.
  #
  def parse_list
    return false unless lexer.scan /\(\s*/
    list = []
    more_values = false
    while contents = (parse_value rescue nil)
      list << contents.value
      more_values = lexer.scan /\s+/
    end
    raise 'Missing value' if more_values
    lexer.scan /\s*\)\s*/ or raise 'Unclosed list'
    AST.new(list)
  end

  # String parser.
  #
  # @return [AST] the AST of the given parser output.
  #
  def parse_string
    return false unless lexer.scan /"/
    string = lexer.scan(/[^\"]+/)
    lexer.scan /"/ or raise 'Unterminated string'
    AST.new(Word.new(*classify_word(string)))
  end

  # Link parser.
  #
  # @return [AST] the AST of the given parser output.
  #
  def parse_link
    return false unless token = lexer.scan(/[\wА-Яа-яЁё!:\-\.\,\?]+/)
    complex_type, id, string = token.split(/:/)
    type, subtype = complex_type.match(/([A-Z]+)(.*)/)[1..2]
    AST.new(Link.new(type, subtype, id.to_i, *classify_word(string)))
  end

  # Skip whitespace characters because we are not interested in them.
  #
  def trim_space!
    lexer.scan /\s+/
    self
  end

  # Word classification method that identifies LEFT-WALL, RIGHT-WALL,
  # punctuation and regular word tokens.
  #
  # @param word [String] the word to classify.
  #
  # @return [Array<[String, Symbol], [String, NilClass]>]
  #   classification data.
  #
  def classify_word(word)
    case word
    when 'LEFT-WALL'  then [ :left_wall ]
    when 'RIGHT-WALL' then [ :right_wall ]
    when '.'          then [ '.' ]
    else
      if unknown_word = word.match(/^\[(.+)\]$/)
        [ unknown_word[1] ]
      else
        word.split('.', 2).map { |s| !s.empty? ? s : nil }
      end
    end
  end
end
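A minimal usage sketch for the lexer on its own. The input string below is a hypothetical fragment shaped the way the grammar above expects (a top-level list of per-word sublists, each holding a quoted word followed by its link tokens); the real output of the Web service may differ:

# Assumes both files are on the load path as link_parser.rb and link_parser/lexer.rb.
require 'link_parser'

# Hypothetical parser output, not captured from the real service.
output = '(("LEFT-WALL" W:1:собака.Ncfsn) ("собака.Ncfsn" W:1:LEFT-WALL))'
ast = LinkParser::Lexer.new(output).parse
# ast is a nested Array: each inner Array holds one Word struct
# followed by the Link structs attached to that word.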
link_parser.rb
# encoding: utf-8

require 'uri'
require 'net/http'
require 'nokogiri'

# An interface to {http://slashzone.ru/parser/ Link Grammar for Russian}.
#
module LinkParser
  # Parser URL.
  #
  URL = URI('http://slashzone.ru/parser/parse.pl')

  class << self
    # Analyze one sentence via the Web interface of
    # Link Grammar for Russian.
    #
    # @param sentence [String] the text to analyze.
    #
    # @return [Hash<LinkParser::Lexer::Word,
    #   Array<LinkParser::Lexer::Link>>]
    #   the result of the analysis.
    #
    def analyze sentence
      links = Lexer.new(request(sentence)).parse
      words = links.flatten.select do |e|
        e.kind_of? LinkParser::Lexer::Word
      end
      unless links.size == words.size
        raise 'Wrong parse results of %d links with %d words' %
          [ links.size, words.size ]
      end
      Hash[
        words.size.times.map do |i|
          [
            words[i],
            links[i].select do |e|
              e.kind_of? LinkParser::Lexer::Link
            end
          ]
        end
      ]
    end

    private

    def request sentence
      sentence_in_cp1251 = sentence.strip
      sentence_in_cp1251.gsub!('Ё', 'Е')
      sentence_in_cp1251.gsub!('ё', 'е')
      sentence_in_cp1251.encode!('CP1251')
      page = Net::HTTP.post_form(URL,
        'Sentence'    => sentence_in_cp1251,
        'LinkDisplay' => 'on',
        'ShortLength' => '6',
        'Maintainer'  => 'parser_at_svp.zuzino.net.ru',
        'NullLinks'   => 'on'
      ).body.encode! 'UTF-8', 'CP1251'
      Nokogiri::HTML(page).xpath('.//pre').last.text.tap(&:strip!)
    end
  end
end

require 'link_parser/lexer'
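And a sketch of the full pipeline. It performs a live HTTP request to slashzone.ru, so it only works while the service is reachable; the sample sentence is arbitrary:

require 'link_parser'

LinkParser.analyze('Собака лает.').each do |word, links|
  # word is a LinkParser::Lexer::Word, links is an Array of LinkParser::Lexer::Link.
  puts '%s (%s): %d link(s)' % [ word.word, word.msd, links.size ]
end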
It looks like a typo crept into link_parser.rb:62.
How careless of me! Thanks, fixed.
However, this analyzer does not bother with such things and does as it pleases :)