require "addressable/uri"
require "yaml"
class Site < ActiveRecord::Base
include Utils
include BadEntity
FEATURED_AT_POPULARITY = 0.5
has_many :articles_topics
has_one :site_rating
has_many :search_results
has_many :startups, :foreign_key => :newsmaker_id, :class_name => "SearchResult"
has_many :source_urls, :through => :search_results
has_one :url_status
has_many :sites_tags
has_many :sites_users
has_many :users, :through => :sites_users
has_one :press_room
has_one :site_profile, :foreign_key => :site_id
accepts_nested_attributes_for :site_profile, :update_only => true
has_many :sites_categories
has_many :categories, :through => :sites_categories
has_one :companies_site
has_many :site_reviews
has_one :site_meta_parser
has_one :low_quality_site
scope :startup, where("site_type = 0")
scope :newsmaker, where("site_type = 1")
scope :visible, where("visibility_type<>2")
validates :domain_name, :presence => true, :allow_blank => false
after_initialize :generate_hashes
after_initialize :default_values
after_save :update_tags
before_destroy :destroy_dependences
after_update :clear_cache
@@search_results_limit = 100
@@tags_limit = 5
@@tags_ttl = PressAbout::Application.config.site_tags_ttl
@@site_ttl = PressAbout::Application.config.sites_ttl
@@import_slice_count = 10
@@redis = PressAbout::Application.config.redis_server
@@sites_parsing_queue_key = "press_about:production:sites_parsing_queue"
cattr_reader :redis, :sites_parsing_queue_key, :tags_limit
attr_accessor :saved_site_profile, :saved_site_rating
attr_reader :new_tags
define_index "site_domain_name" do
indexes domain_name, :as => :domain_name
has created_at
has id, :as => :site_id
has "case is_english when -1 then 1 when 0 then 0 when 1 then 2 else 1 end", :as => :is_english, :type => :integer, :sort => true
has "case bad_rank when bad_rank < 0 then 0 when bad_rank > 0 then 2 else 1 end", :as => :bad_rank, :type => :integer, :sort => true
set_property :enable_star => 1
set_property :min_infix_len => 3
end
define_index "site_title" do
indexes title, :as => :title
has created_at
has id, :as => :site_id
has "case is_english when -1 then 1 when 0 then 0 when 1 then 2 else 1 end", :as => :is_english, :type => :integer, :sort => true
has "case bad_rank when bad_rank < 0 then 0 when bad_rank > 0 then 2 else 1 end", :as => :bad_rank, :type => :integer, :sort => true
where "title <> ''"
end
define_index "site_description" do
indexes description, :as => :description
has "case is_english when -1 then 1 when 0 then 0 when 1 then 2 else 1 end", :as => :is_english, :type => :integer, :sort => true
has "case bad_rank when bad_rank < 0 then 0 when bad_rank > 0 then 2 else 1 end", :as => :bad_rank, :type => :integer, :sort => true
has created_at
has id, :as => :site_id
where "title <> ''"
end
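
  # The CASE attributes above fold the raw columns into a sortable 0/1/2 scale
  # (0 = negative/non-English, 1 = unknown, 2 = positive/English), so Sphinx can
  # order mixed results with unknowns in the middle. The same mapping in plain
  # Ruby (illustrative only, not part of the model):
  #
  #   def self.rank_bucket(bad_rank)
  #     return 0 if bad_rank < 0   # flagged as bad
  #     return 2 if bad_rank > 0   # verified as good
  #     1                          # not yet ranked
  #   end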
  @@site_types = {
    :startup => 0,
    :newsmaker => 1
  }

  @@parsing_statuses = {
    :new => 0,
    :parsed => 1,
    :error => 2,
    :empty => 3,
    :processing => 4,
    :not_found => 10
  }

  @@visibility_types = {
    :default => 0, # regular project
    :press_room => 1, # verified project; show its press room
    :hidden => 2 # hidden project
  }
  before_validation(:on => :create) do
    # NOTE: `return` inside this block would raise LocalJumpError at runtime;
    # `next` exits the callback with the intended value instead.
    if attribute_present?("domain_name")
      domain_name = self.class.domain_from_url(self.domain_name)
      if domain_name.blank?
        errors.add(:domain_name, "not valid")
        next false
      else
        self.domain_name = domain_name
      end
      self.hash_domain = self.class.hash_from(domain_name)
      unless self.newsmaker?
        if domain_exists?(domain_name, 15)
          answered_domain = answered_domain_name(domain_name)
          if answered_domain.blank?
            errors.add(:domain_name, "not answered")
            next false
          end
          if answered_domain != domain_name
            errors.add(:domain_name, "not found. Maybe '#{answered_domain}'?")
            next false
          end
        else
          errors.add(:domain_name, "not found")
          next false
        end
        next true
      end
    end
  end
  validates_each :new_tags do |record, attr, value|
    if value
      record.errors.add(:tags, "no more than #{Site.tags_limit} tags are allowed") if value.size > Site.tags_limit
      record.errors.add(:tags, "each tag must be 3..50 characters long") if value.find { |s| !(3..50).include?(s.size) }
      record.errors.add(:tags, "a tag must be at most 3 words and contain only letters and numbers") if value.find { |s| !Tag.tag_is_valid?(s) }
    end
  end

  def tags
    unless @tags
      site_tags = self.sites_tags.map { |s| s.tag_id }.to_a
      @tags = site_tags.blank? ? [] : Tag.where(:id => site_tags).to_a
    end
    @tags
  end

  def tags=(value)
    @new_tags = Tag.convert_string_to_tags(value.to_s, false)
  end

  def update_tags
    if @new_tags
      old_tags = self.tags.map { |t| t.value }
      if @new_tags.sort != old_tags.sort
        deleted_tags = old_tags - @new_tags
        new_tags = @new_tags - old_tags
        # pass tag ids (not Tag objects) to the IN(?) condition
        SitesTag.delete_all(["site_id = ? and tag_id in (?)", self.id, self.tags.select { |s| deleted_tags.include?(s.value) }.map { |s| s.id }]) unless deleted_tags.blank?
        self.add_tags(new_tags)
      end
    end
  end
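
  # Tag lifecycle, end to end (a sketch; assumes an existing record and that
  # Tag.convert_string_to_tags splits a comma-separated string):
  #
  #   site = Site.find_by_domain_name("example.com")
  #   site.tags = "ruby, rails, search"   # parsed into @new_tags, validated above
  #   site.save                           # after_save :update_tags diffs old vs new
  #   site.limited_tags                   # cached, display-ready tag values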
  def clear_cache
    logger.debug('---clear_cache')
    self.clear_page_cache('startup')
    self.clear_page_cache('newsmaker')
    self.class.delete_site_from_cache(self.id)
    self.class.delete_limited_tags_from_cache(self.id)
  end

  def destroy_dependences
    # destroy startup dependences
    self.site_profile.destroy if self.site_profile
    SiteRating.delete_all(["site_id = ?", self.id])
    SourceUrl.update_all("check_on_delete = 1", ["id in (?)", self.search_results.map { |source| source.source_url_id }])
    # destroy newsmaker dependences
    SourceUrl.delete_all(["newsmaker_id = ?", self.id])
    SearchResult.delete_all(["site_id = ?", self.id])
    SearchResult.delete_all(["newsmaker_id = ?", self.id])
    # destroy tag dependences
    SitesTag.delete_all(["site_id = ?", self.id])
    # press releases belong to the press room, so resolve its id before deleting it
    PressRelease.delete_all(["press_room_id = ?", self.press_room.id]) if self.press_room
    PressRoom.delete_all(["site_id = ?", self.id])
    SiteReview.delete_all(["site_id = ?", self.id])
    SitesUser.delete_all(["site_id = ?", self.id])
    LowQualitySite.delete_all(["site_id = ?", self.id])
    clear_cache
  end
  def generate_hashes
    if attribute_present?("domain_name")
      self.hash_domain = self.class.hash_from(self.domain_name)
    end
  end

  def domain_url
    "http://#{self.domain_name}"
  end

  def domain_name(force=false)
    if @domain_name.nil? or force == true
      @domain_name = read_attribute(:domain_name).to_s.downcase
      @domain_name = @domain_name[0..-2] if @domain_name.last =~ /\./
    end
    @domain_name
  end

  def title_safe
    unless @title_safe
      @title_safe = title.blank? ? domain_name : sanitize_string_for_view(title).capitalize_first
    end
    @title_safe
  end

  def description_safe
    unless @description_safe
      @description_safe = description.blank? ? description : sanitize_string_for_view(description[0..1000])[0..500].capitalize_first
    end
    @description_safe
  end

  def similar_sources(limit=2)
    similar_string = self.domain_name.split('.')[0..-2].join('.')
    similars = self.source_urls.each { |source| source.similarity = similar_string.similar(source.title) }
    # most similar first, capped at limit
    similars.sort_by { |s| -s.similarity }.first(limit)
  end
  #def company
  #  self.press_room ? self.press_room.company : nil
  #end

  # Keywords
  def calculated_keywords
    self.class.calculated_keywords(self)
  end

  # Visibility type
  def set_visibility_type(type)
    value = @@visibility_types[type.to_sym]
    if value
      self.update_attribute(:visibility_type, value) unless self.visibility_type == value
    else
      errors.add(:visibility_type, "Visibility type not available")
      return false
    end
    self
  end

  def get_visibility_type
    @@visibility_types.key(self.visibility_type)
  end

  def visibility_hidden?
    self.visibility_type == self.class.visibility_type_by_name(:hidden)
  end

  def visibility_press_room?
    self.visibility_type == self.class.visibility_type_by_name(:press_room)
  end

  # Parsing status
  def set_parsing_status(status)
    value = @@parsing_statuses[status.to_sym]
    if value
      self.update_attribute(:status, value) unless self.status == value
    else
      errors.add(:status, "Status not available")
      return false
    end
    self
  end

  def get_parsing_status
    @@parsing_statuses.key(self.status)
  end

  def set_as_parsed!(status=:parsed)
    status_value = @@parsing_statuses[status.to_sym]
    if status_value
      self.class.update_all({:status => status_value, :last_parsed_at => Time.now}, ["id = ?", self.id])
      self.clear_cache
    end
  end

  # Parsing permissions
  def last_parsed
    self.last_parsed_at.blank? ? 'never' : self.last_parsed_at.strftime("%b. %d, %Y")
  end

  def allow_parsing?
    self.last_parsed_at.blank? ? true : (self.last_parsed_at < (Date.today - 1.day))
  end

  def next_parsing_time
    self.last_parsed_at.blank? ? Date.today : self.last_parsed_at + 1.day
  end
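
  # Typical re-parse gate (a sketch; assumes a background job decides when to
  # re-crawl a site):
  #
  #   if site.allow_parsing?             # last parse was never, or over a day ago
  #     site.add_to_searching(:priority => 'high')
  #     site.set_as_parsed!(:processing)
  #   else
  #     logger.info("next parse at #{site.next_parsing_time}")
  #   end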
  # Favicon
  def favicon
    self.class.favicon_for_domain(self.domain_name)
  end

  # Last access
  def check_last_get_access
    if self.custom? and (self.last_get_access.blank? or self.last_get_access < Date.today)
      self.update_attribute(:last_get_access, Date.today)
    end
  end

  # Owner type
  def custom?
    self.owner_type == 1
  end

  def set_custom
    self.owner_type = 1 unless self.custom?
  end

  def set_custom!
    self.update_attribute(:owner_type, 1) unless self.custom?
  end

  # Ratings
  #def rating
  #  if self.saved_site_rating
  #    self.saved_site_rating.rating
  #  elsif self.rating
  #    self.site_rating.rating
  #  end
  #end
  def rating
    unless @rating
      if self.site_rating and self.site_rating.rating >= 0
        @rating = self.site_rating.rating
      else
        @rating = self.reset_rating
      end
    end
    @rating
  end

  def reset_rating
    rating = SiteRating.site_rating(self)
    SiteRating.delete_all(:site_id => self.id)
    SiteRating.create(:rating => rating, :site_id => self.id)
    rating
  end
  # Site type
  def get_site_type
    @@site_types.key(self.site_type)
  end

  def newsmaker?
    self.site_type == self.class.site_type_by_name(:newsmaker)
  end

  def startup?
    self.site_type == self.class.site_type_by_name(:startup)
  end

  def set_as_newsmaker
    self.site_type = self.class.site_type_by_name(:newsmaker)
  end

  def set_as_startup
    self.site_type = self.class.site_type_by_name(:startup)
  end

  def set_as_startup!
    self.update_attribute(:site_type, self.class.site_type_by_name(:startup)) unless startup?
  end

  def update_bad_rank!(force=false)
    if self.bad_rank == 0 or force == true
      bad_rank = text_is_bad?(self.bad_entity_sample) ? -1 : 1
      self.update_attribute(:bad_rank, bad_rank)
      SearchResult.update_all("site_bad_rank = #{bad_rank}", ["site_id = ?", self.id])
      SitesTag.update_all("site_bad_rank = #{bad_rank}", ["site_id = ?", self.id])
    end
  end
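
  # bad_rank encodes moderation state in one integer: 0 = not yet checked,
  # -1 = flagged by the bad-words filter, 1 = clean (see is_good? below).
  # Denormalized copies on SearchResult and SitesTag let list queries filter
  # without a join. An illustrative query (a sketch, not part of the model):
  #
  #   Site.where("bad_rank >= 0")   # everything not explicitly flagged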
  # Languages
  def is_english?
    self.is_english == 1
  end

  def update_language!(force=false)
    self.update_attribute(:is_english, text_is_english?([self.title, self.description].join)) if self.is_english < 0 or force == true
  end

  def set_language
    self.is_english = text_is_english?([self.title, self.description].join) if !self.title.blank? or !self.description.blank?
  end

  # Cache
  def page_cache_key(type='startup')
    "page_#{type}_#{self.id}"
  end

  def clear_page_cache(type='startup')
    ActionController::Base.new.expire_fragment(page_cache_key(type))
  end

  # Tags
  def add_tags(tags=[])
    @limited_tags = nil
    self.class.add_tags_to_site(self, tags)
  end

  def correct_limited_tags(limit=5)
    unless @correct_tags
      site_tags = SitesTag.where(:site_id => self.id).limit(@@tags_limit * 2).select('tag_id').to_a
      @correct_tags = Tag.where(:id => site_tags.map { |s| s.tag_id }).select { |tag| Tag.tag_is_valid?(tag.value) }.sort_by { |tag| -tag.is_english }.map { |tag| tag.value }.to_a
      @correct_tags = @correct_tags.first(limit)
    end
    @correct_tags
  end

  def limited_tags(limit=3)
    return @limited_tags unless @limited_tags.nil?
    @limited_tags = self.id.blank? ? [] : self.class.limited_tags_by_site(self.id, limit)
  end

  # Bad words
  def bad_entity_sample
    "#{self.domain_name} #{self.title_safe} #{self.description_safe} #{self.limited_tags.join(" ")}"
  end

  def is_good?
    self.bad_rank > 0
  end
  # Logo
  def logo
    if self.saved_site_profile
      self.saved_site_profile.logo
    elsif self.site_profile
      self.site_profile.logo
    end
  end

  def have_logo?(force=false)
    (self.saved_site_profile and self.saved_site_profile.logo_file_name.present?) or
      (self.site_profile(force) and self.site_profile.logo_file_name.present?)
  end

  def verified?
    @verified = self.sites_users.verified.size > 0 if @verified.nil?
    @verified
  end

  # Custom domain
  def custom_domain
    self.site_profile.present? ? self.site_profile.custom_domain : nil
  end

  # ---------------
  def to_param
    self.domain_name
  end
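
  # Overriding to_param makes Rails route helpers build domain-based URLs
  # (a sketch; the route and finder names are assumptions):
  #
  #   site_path(site)                          # => "/sites/example.com", not "/sites/42"
  #   Site.find_by_domain_name(params[:id])    # the matching lookup in a controller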
  def exclude_duplicate_results(sources=[])
    # drop links that point to this site's own domain
    results = sources.reject { |source| self.class.domain_from_url(source[:url]) == self.domain_name }
    duplicates = []
    # compute hashes for the current list
    results = results.each { |source| source[:hash_url] = self.class.hash_from(source[:url]) }
    # fetch the hashes of previously stored links
    source_url_ids = SearchResult.where(:site_id => self.id).select("source_url_id").to_a
    unless source_url_ids.blank?
      old_source_urls = SourceUrl.where(:id => source_url_ids.map { |s| s.source_url_id }).select("id, hash_url")
      old_source_urls_hashes = old_source_urls.map { |source| source.hash_url }
      # find the matches and split them out of the list
      duplicates, results = results.partition { |source| old_source_urls_hashes.include?(source[:hash_url]) }
      # refresh the review date on the matches
      duplicates_hashes = duplicates.map { |source| source[:hash_url] }
      old_source_urls.reject! { |source| !duplicates_hashes.include?(source.hash_url) }
      SearchResult.update_all('last_reviewed_at = CURDATE()', ["site_id = ? and source_url_id in(?)", self.id, old_source_urls.map { |source| source.id }])
    end
    {:new => results, :old => duplicates}
  end
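
  # Expected shape (a sketch; the urls are made up):
  #
  #   site.exclude_duplicate_results([{:url => "http://blog.example.org/post"},
  #                                   {:url => "http://example.com/about"}])
  #   # => {:new => [...unseen sources, each with :hash_url filled in...],
  #   #     :old => [...sources already stored as SearchResults...]}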
  def update_source_urls(sources=[], params={})
    result = {:updated => 0}
    return result if sources.blank?
    updated_sources = SourceUrl.initialize_source_urls(sources)
    source_urls = SourceUrl.where(:hash_url => updated_sources.map { |t| t.hash_url }).select("id, title, description, url, hash_url, is_english, updated_at").to_a
    search_results = self.search_results.where(:source_url_id => source_urls.map { |t| t.id }).to_a
    source_urls.each do |source_url|
      begin
        source = updated_sources.find { |t| t.hash_url == source_url.hash_url }
        search_result = search_results.find { |t| t.source_url_id == source_url.id }
        if source
          attr = {}
          attr[:title] = source.title if source.title.present? and source.title != source_url.title
          attr[:description] = source.description if source.description.present? and source.description != source_url.description
          is_english = SourceUrl.whats_language?(source)
          attr[:is_english] = is_english if is_english != source_url.is_english
          SourceUrl.update_all(attr, ["id = ?", source_url.id]) unless attr.blank?
        end
        if search_result
          weight_params = SearchResult.get_weight_params(self, source_url)
          weight_params = weight_params.merge(:weight => SearchResult.calculate_weight_by_params(weight_params))
          SearchResult.update_all(weight_params, ["site_id = ? and source_url_id = ?", self.id, search_result.source_url_id])
        end
        result[:updated] += 1
      rescue => e
        logger.debug("--update_source_urls error #{e}")
      end
    end
    result
  end
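
  # Refresh pass for urls the site already has (a sketch; the url is made up):
  #
  #   site.update_source_urls([{:url => "http://blog.example.org/post",
  #                             :title => "New headline"}])
  #   # => {:updated => 1}  # title/description/language and result weight refreshed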
  def add_source_urls(sources=[], params={})
    result = {:added => 0, :deleted => 0}
    return result if sources.blank?
    new_sources = SourceUrl.initialize_source_urls(sources)
    old_sources = self.load_search_results(:with => [:source_urls, :newsmakers])
    # build the full article list (url, hash_url, weight, created_at, hash_domain)
    results = []
    deleted_sources = []
    new_sources.each do |source|
      results << {
        :hash_url => source.hash_url,
        :weight => SearchResult.calculate_weight(self, source),
        :weight_factor => SearchResult.weight_factor(Date.today),
        :hash_domain => source.hash_domain
      }
    end
    old_sources.each do |source|
      if source.newsmaker and source.source_url
        results << {:hash_url => source.source_url.hash_url, :weight => source.weight, :weight_factor => SearchResult.weight_factor(source.created_at.nil? ? Date.today : source.created_at),
                    :hash_domain => source.newsmaker.hash_domain}
      else
        deleted_sources.push(source)
      end
    end
    # sort by weight and decay factor
    results = results.sort_by { |source| -source[:weight] * source[:weight_factor] }
    # keep one result per hash_domain
    results = results.uniq { |source| source[:hash_domain] }
    # drop everything below the limit
    results = results[0..@@search_results_limit - 1]
    # list of records to save
    results_hashes = results.map { |result| result[:hash_url] }
    new_sources = new_sources.reject { |source| !results_hashes.include?(source.hash_url) }
    # list of records to delete
    deleted_sources += old_sources.reject { |source| source.source_url.present? and results_hashes.include?(source.source_url.hash_url) }
    # insert the new records (newsmakers -> source_urls -> search_results)
    new_source_urls = []
    newsmakers = self.class.create_newsmakers(new_sources)
    new_sources.each do |new_source_url|
      begin
        newsmaker = newsmakers.find { |newsmaker| newsmaker.hash_domain == new_source_url.hash_domain }
        if newsmaker
          new_source_url.newsmaker_id = newsmaker.id
        else
          newsmaker = Site.create(:domain_name => new_source_url.domain_name, :site_type => Site.site_type_by_name(:newsmaker))
          new_source_url.newsmaker_id = newsmaker.id if newsmaker.persisted?
        end
        new_source_urls << new_source_url
      rescue => e
        logger.error("Site.add_source_urls #{e.class}: #{e.message} | url = #{new_source_url.url}")
      end
    end
    SourceUrl.import(new_source_urls, :ignore => true, :validate => false, :synchronize => new_source_urls, :synchronize_keys => [:hash_url])
    result[:added] = new_source_urls.size
    Erasers::SourceUrlsEraser.remove_from_queue(new_source_urls.map { |source| source.id }, false)
    if params[:update_statuses] == true
      new_source_urls.each do |source_url|
        source_url.update_status(source_url.status) if source_url.status
      end
    end
    new_search_results = []
    new_source_urls.each do |source_url|
      new_search_results << SearchResult.initialize_by_site_and_source_url(self, source_url)
    end
    SearchResult.import(new_search_results, :validate => false)
    # delete what ended up in deleted_sources
    unless deleted_sources.empty?
      result[:deleted] = deleted_sources.size
      SearchResult.delete_all(["site_id = ? and source_url_id in (?)", self.id, deleted_sources.map { |source| source.source_url_id }])
      # queue the deleted source_urls for a deletion check
      #SourceUrl.update_all("check_on_delete = 1", ["id in (?)", deleted_sources.map { |source| source.source_url_id }])
      Erasers::SourceUrlsEraser.add_to_queue(deleted_sources.map { |source| source.source_url_id })
    end
    result
  end
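
  # The ranking step above in one line: candidates (new + existing) are scored
  # by weight * weight_factor, deduplicated per source domain, and trimmed to
  # @@search_results_limit; whatever falls off the list is deleted. A sketch
  # of a typical call (the url is made up):
  #
  #   site.add_source_urls([{:url => "http://news.example.net/a", :title => "A"}],
  #                        :update_statuses => true)
  #   # => {:added => 1, :deleted => 0}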
  def load_search_results(options={})
    search_results = self.search_results
    if options[:with]
      source_urls = []
      newsmakers = []
      if options[:with].include?(:source_urls)
        source_urls = SourceUrl.where(:id => search_results.map { |s| s.source_url_id }).select("id, hash_url, newsmaker_id")
      end
      if options[:with].include?(:newsmakers)
        newsmakers = Site.where(:id => search_results.map { |s| s.newsmaker_id }).select("id, hash_domain")
      end
      search_results.each do |search_result|
        search_result.source_url = source_urls.find { |s| s.id == search_result.source_url_id }
        search_result.newsmaker = newsmakers.find { |s| s.id == search_result.newsmaker_id }
      end
    end
    #self.search_results.includes(:source_url, :newsmaker)
    search_results
  end
  def add_to_searching(params={})
    uuid = "site_#{md5(self.domain_name)}"
    case params[:priority]
    when 'googlebot'
      self.class.redis.rpush(self.class.sites_parsing_queue_key, self.domain_name)
    when 'user'
      UserSearchParsingWorker.create(:domain_name => self.domain_name, :id => self.id, :uuid => "#{uuid}_user")
    when 'high'
      HighSearchParsingWorker.create(:domain_name => self.domain_name, :id => self.id, :uuid => "#{uuid}_high")
    else
      SearchParsingWorker.create(:domain_name => self.domain_name, :id => self.id, :uuid => uuid)
    end
  end
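
  # Priority routing (a sketch): 'googlebot' hits defer work into a Redis list
  # that Site.add_sites_to_searching drains later; every other priority enqueues
  # a parsing worker immediately with a deterministic uuid, so repeated calls
  # don't create duplicate jobs.
  #
  #   site.add_to_searching(:priority => 'user')   # user-initiated re-parse
  #   site.add_to_searching                        # default background queue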
def set_url_status(status="")
unless status.blank?
unless self.url_status
self.create_url_status(:status => status)
else
self.url_status.update_attributes(:status => status)
end
else
self.url_status.destroy if self.url_status
end
end
def as_json(options={})
site = super
site = site.merge(:site_profile => self.site_profile.as_json) if self.site_profile
site
end
# API
def json_profile
{
:domain_name => self.domain_name,
:title => self.title_safe.to_s,
:description => self.description_safe.to_s
}
end
  # Topics
  def latest_topics(limit=5, max_limit=50)
    unless @latest_topics
      @latest_topics = []
      articles_topics = ArticlesTopic.select("topic_id").where(:site_id => self.id).order("article_id DESC").limit(max_limit).to_a
      if articles_topics.present?
        articles_topics = articles_topics.map { |a| a.topic_id }.uniq.first(limit)
        topics = Topic.select("id, value").where(:id => articles_topics).to_a
        articles_topics.each { |a| @latest_topics.push(topics.find { |t| t.id == a }) }
        @latest_topics.compact!
      end
    end
    @latest_topics
  end

  # Featured at
  def featured_at(limit=6)
    results = []
    search_results = self.search_results.select("newsmaker_id, rating, logo_file_name").
      joins("left join site_ratings on newsmaker_id = site_ratings.site_id").
      joins("left join site_profiles on newsmaker_id = site_profiles.site_id").to_a
    search_results.select! { |s| s.rating.present? and s.logo_file_name.present? and s.rating >= FEATURED_AT_POPULARITY }
    search_results.sort! { |a, b| b.rating <=> a.rating }
    search_results = search_results.first(limit)
    if search_results.present?
      sites = Site.where(:id => search_results.map(&:newsmaker_id)).to_a
      search_results.each do |search_result|
        site = sites.find { |s| s.id == search_result.newsmaker_id }
        unless site.blank?
          site.saved_site_profile = SiteProfile.new(:logo_file_name => search_result.logo_file_name)
          site.saved_site_profile.site_id = search_result.newsmaker_id
          results << site
        end
      end
    end
    results
  end
  private

  def prepare_source_urls(sources=[])
    sources.each do |source|
      source[:hash_url] = self.class.hash_from(source[:url])
    end
  end

  def default_values
    unless self.persisted?
      self.title ||= ''
      self.description ||= ''
      self.last_parsed_at ||= false
      self.related_parsed_at ||= false
      self.last_get_access ||= false
      self.created_at ||= false
      self.updated_at ||= false
      self.set_language
    end
  rescue ActiveModel::MissingAttributeError
  end
  class << self
    include Utils

    def default_domain_name
      "stickr.com"
    end

    # Favicon
    def favicon_for_domain(domain_name)
      #"http://faviget.appspot.com/http://#{domain_name}"
      "http://www.google.com/s2/favicons?domain_url=http://#{domain_name}"
    end

    # Tags
    def add_tags_to_site(site, tags=[])
      return if tags.empty?
      # create the new tags
      tags = Tag.create_tags(tags)
      # create the site associations
      new_site_tags = []
      tags.each do |tag|
        new_site_tags << SitesTag.new(:site_id => site.id, :tag_id => tag.id)
      end
      SitesTag.import(new_site_tags, :validate => false, :ignore => true, :slice_by => @@import_slice_count)
      # the site now carries its new tags
      site
    end

    def tags_cache_key(site_id)
      "site_tags:#{site_id}"
    end

    def delete_limited_tags_from_cache(site_id)
      Rails.cache.delete(tags_cache_key(site_id))
    end

    def limited_tags_by_site(site_id, limit=3)
      tags = Rails.cache.fetch(tags_cache_key(site_id), :expires_in => @@tags_ttl) do
        site_tags = SitesTag.where(:site_id => site_id).limit(@@tags_limit).select('tag_id').to_a
        site_tags.blank? ? [] : Tag.where(:id => site_tags.map { |s| s.tag_id }).sort_by { |tag| -tag.is_english }.map { |tag| tag.value }.to_a
      end
      tags.blank? ? [] : tags.sort.first(limit)
    end
    # Cache
    def site_cache_key(site_id)
      "site:#{site_id}"
    end

    def delete_site_from_cache(site_id)
      Rails.cache.delete(site_cache_key(site_id))
    end

    def read_site_from_cache(site_id)
      Rails.cache.read(site_cache_key(site_id), :raw => true)
    end

    def write_site_to_cache(site)
      begin
        raise "site blank" unless site
        payload = dump(site)
        Rails.cache.write(site_cache_key(site.id), payload, :expires_in => @@site_ttl, :raw => true)
      rescue => e
        logger.debug("--#{e}")
      end
    end

    def load_site_by_id(site_id)
      site = read_site_from_cache(site_id)
      unless site
        site = find(site_id)
        write_site_to_cache(site)
      end
      site
    end

    def dump(site)
      ActiveSupport::JSON.encode(site)
    end

    def load(dump)
      decode = ActiveSupport::JSON.decode(dump)
      site = new(decode.except('id', 'site_profile'))
      site.id = decode['id']
      if decode['site_profile'].present?
        site.saved_site_profile = SiteProfile.new(decode['site_profile'].except('site_id'))
        site.saved_site_profile.site_id = decode['site_profile']['site_id']
      end
      site
    end
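
    # Round trip through the JSON cache (a sketch; assumes a configured
    # Rails.cache store):
    #
    #   Site.write_site_to_cache(Site.find(42))   # stores a raw JSON dump
    #   json = Site.read_site_from_cache(42)      # => String, or nil on a miss
    #   site = Site.load(json) if json            # rebuilt, unpersisted Site with id 42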
    def load_sites_by_id(site_ids=[], allow_caching=true, params={})
      ids = site_ids.is_a?(Array) ? site_ids : [site_ids]
      sites = []
      time_interval("---load_sites_by_id") do
        dumps = []
        if allow_caching == true
          begin
            ids.each { |id| dumps.push(read_site_from_cache(id)) }
            dumps.each do |dump|
              sites.push(load(dump)) if dump
            end
          rescue => e
            logger.debug(e)
            ids.each { |id| delete_site_from_cache(id) }
          end
          sites = sites.compact
        end
        cached_ids = sites.map { |site| site['id'] }
        not_cached_ids = ids.reject { |id| cached_ids.include?(id) }
        unless not_cached_ids.empty?
          not_cached_sites = where("id in (?)", not_cached_ids).includes(:site_profile).to_a
          not_cached_sites.each { |site| write_site_to_cache(site) } if allow_caching == true
          sites += not_cached_sites
          sites.each { |s| s.limited_tags }
        end
      end
      sites
    end

    # Language
    def is_english?(site)
      site['is_english'] == 1
    end

    def whats_language?(site)
      if !site['title'].blank? or !site['description'].blank?
        text_is_english?([site['title'], site['description']].join) ? 1 : 0
      else
        -1
      end
    end
    # Keywords
    def calculated_keywords(site)
      search_words_composition(site['domain_name'], index_keywords_from_text([site['title'], site['description']].join(' ')))
    end

    # Site types
    def site_type_by_name(key)
      @@site_types[key.to_sym]
    end

    # Visibility type
    def visibility_types
      @@visibility_types
    end

    def list_visibility_types
      @@visibility_types.except(:default)
    end

    def visibility_type_by_name(name)
      @@visibility_types[name.to_sym]
    end

    # Parsing status
    def parsing_status_by_name(status_name)
      @@parsing_statuses[status_name.to_sym]
    end

    def hash_from(source)
      Digest::MD5.hexdigest(source.strip) unless source.blank?
    end

    def add_group_sites_to_parsing(ids=[])
      unless ids.blank?
        uuid = "meta_parsing_#{hash_from(ids.join(""))}"
        SitesMetaParsingWorker.create(:ids => ids, :uuid => uuid)
      end
    end

    def add_sites_to_searching
      counter = redis.llen(sites_parsing_queue_key).to_i
      (0..counter).each do
        domain_name = redis.lpop sites_parsing_queue_key
        unless domain_name.blank?
          site = Site.new(:domain_name => domain_name)
          site.add_to_searching(:priority => 'high')
        end
      end
      counter
    end
    def create_newsmakers(newsmakers=[], params={})
      newsmakers_hash_domains = newsmakers.map { |newsmaker| hash_from(newsmaker.respond_to?(:domain_name) ? newsmaker.domain_name : newsmaker[:domain_name]) }
      old_newsmakers = self.where(:hash_domain => newsmakers_hash_domains).select("id, domain_name, hash_domain")
      old_newsmakers_hash_domains = old_newsmakers.map { |newsmaker| newsmaker.hash_domain }
      new_newsmakers = newsmakers.reject { |newsmaker| old_newsmakers_hash_domains.include?(hash_from(newsmaker.respond_to?(:domain_name) ? newsmaker.domain_name : newsmaker[:domain_name])) }
      created_newsmakers = []
      new_newsmakers.each do |newsmaker|
        begin
          site = Site.new(:domain_name => newsmaker.respond_to?(:domain_name) ? newsmaker.domain_name : newsmaker[:domain_name], :site_type => site_type_by_name(:newsmaker))
          created_newsmakers << site if site and !site.domain_name.blank?
        rescue => error
          logger.debug("Site.create_newsmakers error = #{error.message}")
        end
      end
      # TODO: add a slice_by implementation
      if params[:synchronize] == false
        import(created_newsmakers, :validate => false)
      else
        import(created_newsmakers, :validate => false, :synchronize => created_newsmakers, :synchronize_keys => [:hash_domain])
        SiteMetaParser.add_sites(created_newsmakers)
        old_newsmakers + created_newsmakers
      end
    end
  end
end
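
# End-to-end sketch of the model's main flow (console-style; the domain and the
# parsed_results variable are made up, and the helper methods come from the
# Utils/BadEntity mixins):
#
#   site = Site.new(:domain_name => "http://Example.com/")  # normalized to "example.com"
#   site.save                                               # DNS-validated for non-newsmakers
#   site.add_to_searching(:priority => 'user')              # queue a crawl
#   site.add_source_urls(parsed_results)                    # store ranked press mentions
#   site.featured_at                                        # top-rated newsmakers with logos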