Link Grammar for Russian (Parser of the Parser)
link_parser/lexer.rb
# encoding: utf-8

require 'strscan'

# Processor of Link Grammar for Russian output.
#
class LinkParser::Lexer
  # This exception is raised when the link grammar is invalid and the Lexer
  # is unable to understand the output.
  #
  class InvalidLinkGrammar < RuntimeError
    attr_reader :input

    # @private
    def initialize input
      super 'Invalid link grammar'
      @input = input
    end
  end

  # Abstract syntax tree of the parser output.
  #
  AST = Struct.new(:value)

  # A structure that represents a link in Link Grammar.
  # Includes type and position definitions along with the word and its
  # morphosyntactic descriptors.
  #
  Link = Struct.new(:type, :subtype, :id, :word, :msd)

  # A structure that represents a word in Link Grammar. Includes
  # morphosyntactic descriptors.
  #
  Word = Struct.new(:word, :msd)

  attr_reader :input, :lexer
  private :input, :lexer

  # Create a new {Lexer} instance to process the given parser output.
  #
  # @param input [String] output of the parser.
  #
  def initialize input
    @input = input
  end

  # Perform parsing of the parser output. This wording is silly, but
  # I really can't implement a good Link Parser right now.
  #
  # @return [AST] the AST of the given parser output.
  #
  def parse
    @lexer = StringScanner.new(input)
    parse_value.value
  ensure
    lexer.eos? or
      raise('Unexpected data: "%s"' % lexer.string[lexer.pos..-1])
  end

  protected

  # Parse any supported syntactic construction of our parser.
  #
  # @return [AST] the AST of the given parser output.
  #
  def parse_value
    trim_space!
    parse_list or
      parse_string or
      parse_link or
      raise InvalidLinkGrammar, input
  ensure
    trim_space!
  end

  # List parser.
  #
  # @return [AST] the AST of the given parser output.
  #
  def parse_list
    return false unless lexer.scan /\(\s*/
    list = []
    more_values = false
    while contents = (parse_value rescue nil)
      list << contents.value
      more_values = lexer.scan /\s+/
    end
    raise 'Missing value' if more_values
    lexer.scan /\s*\)\s*/ or raise 'Unclosed list'
    AST.new(list)
  end

  # String parser.
  #
  # @return [AST] the AST of the given parser output.
  #
  def parse_string
    return false unless lexer.scan /"/
    string = lexer.scan(/[^\"]+/)
    lexer.scan /"/ or raise 'Unterminated string'
    AST.new(Word.new(*classify_word(string)))
  end

  # Link parser.
  #
  # @return [AST] the AST of the given parser output.
  #
  def parse_link
    return false unless token = lexer.scan(/[\wА-Яа-яЁё!:\-\.\,\?]+/)
    complex_type, id, string = token.split(/:/)
    type, subtype = complex_type.match(/([A-Z]+)(.*)/)[1..2]
    AST.new(Link.new(type, subtype, id.to_i, *classify_word(string)))
  end

  # Skip whitespace characters because we are not interested in them.
  #
  def trim_space!
    lexer.scan /\s+/
    self
  end

  # Word classification method that identifies LEFT-WALL, RIGHT-WALL,
  # punctuation and regular word tokens.
  #
  # @param word [String] the word to classify.
  #
  # @return [Array<[String, Symbol], [String, NilClass]>]
  #   classification data.
  #
  def classify_word(word)
    case word
    when 'LEFT-WALL'  then [ :left_wall ]
    when 'RIGHT-WALL' then [ :right_wall ]
    when '.'          then [ '.' ]
    else
      if unknown_word = word.match(/^\[(.+)\]$/)
        [ unknown_word[1] ]
      else
        word.split('.', 2).map { |s| !s.empty? ? s : nil }
      end
    end
  end
end
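A minimal usage sketch for the lexer on its own. The input string below is a hypothetical fragment shaped the way the grammar above expects (a top-level list of per-word sublists, each holding a quoted word followed by its link tokens); the real output of the Web service may differ:

# Assumes both files are on the load path as link_parser.rb and link_parser/lexer.rb.
require 'link_parser'

# Hypothetical parser output, not captured from the real service.
output = '(("LEFT-WALL" W:1:собака.Ncfsn) ("собака.Ncfsn" W:1:LEFT-WALL))'
ast = LinkParser::Lexer.new(output).parse
# ast is a nested Array: each inner Array holds one Word struct
# followed by the Link structs attached to that word.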
link_parser.rb
# encoding: utf-8

require 'uri'
require 'net/http'
require 'nokogiri'

# An interface to {http://slashzone.ru/parser/ Link Grammar for Russian}.
#
module LinkParser
  # Parser URL.
  #
  URL = URI('http://slashzone.ru/parser/parse.pl')

  class << self
    # Analyze one sentence via the Web interface of
    # Link Grammar for Russian.
    #
    # @param sentence [String] the text to analyze.
    #
    # @return [Hash<LinkParser::Lexer::Word,
    #   Array<LinkParser::Lexer::Link>>]
    #   the result of the analysis.
    #
    def analyze sentence
      links = Lexer.new(request(sentence)).parse
      words = links.flatten.select do |e|
        e.kind_of? LinkParser::Lexer::Word
      end
      unless links.size == words.size
        raise 'Wrong parse results of %d links with %d words' %
          [ links.size, words.size ]
      end
      Hash[
        words.size.times.map do |i|
          [
            words[i],
            links[i].select do |e|
              e.kind_of? LinkParser::Lexer::Link
            end
          ]
        end
      ]
    end

    private

    def request sentence
      sentence_in_cp1251 = sentence.strip
      sentence_in_cp1251.gsub!('Ё', 'Е')
      sentence_in_cp1251.gsub!('ё', 'е')
      sentence_in_cp1251.encode!('CP1251')
      page = Net::HTTP.post_form(URL,
        'Sentence'    => sentence_in_cp1251,
        'LinkDisplay' => 'on',
        'ShortLength' => '6',
        'Maintainer'  => 'parser_at_svp.zuzino.net.ru',
        'NullLinks'   => 'on'
      ).body.encode! 'UTF-8', 'CP1251'
      Nokogiri::HTML(page).xpath('.//pre').last.text.tap(&:strip!)
    end
  end
end

require 'link_parser/lexer'
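And a sketch of the full pipeline. It performs a live HTTP request to slashzone.ru, so it only works while the service is reachable; the sample sentence is arbitrary:

require 'link_parser'

LinkParser.analyze('Собака лает.').each do |word, links|
  # word is a LinkParser::Lexer::Word, links is an Array of LinkParser::Lexer::Link.
  puts '%s (%s): %d link(s)' % [ word.word, word.msd, links.size ]
end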
It looks like a typo crept into link_parser.rb:62.
How careless of me! Thanks, fixed.
However, this analyzer does not bother with such things and does as it pleases :)