require "addressable/uri" | |
require "yaml" | |
class Site < ActiveRecord::Base | |
include Utils | |
include BadEntity | |
FEATURED_AT_POPULARITY = 0.5 | |
has_many :articles_topics | |
has_one :site_rating | |
has_many :search_results | |
has_many :startups, :foreign_key => :newsmaker_id, :class_name => "SearchResult" | |
has_many :source_urls, :through => :search_results | |
has_one :url_status | |
has_many :sites_tags | |
has_many :sites_users | |
has_many :users, :through => :sites_users | |
has_one :press_room | |
has_one :site_profile, :foreign_key => :site_id | |
accepts_nested_attributes_for :site_profile, :update_only => true | |
has_many :sites_categories | |
has_many :categories, :through => :sites_categories | |
has_one :companies_site | |
has_many :site_reviews | |
has_one :site_meta_parser | |
has_one :low_quality_site | |
  scope :startup, where("site_type = 0")
  scope :newsmaker, where("site_type = 1")
  scope :visible, where("visibility_type <> 2")

  validates :domain_name, :presence => true, :allow_blank => false

  after_initialize :generate_hashes
  after_initialize :default_values
  after_save :update_tags
  before_destroy :destroy_dependences
  after_update :clear_cache

  @@search_results_limit = 100
  @@tags_limit = 5
  @@tags_ttl = PressAbout::Application.config.site_tags_ttl
  @@site_ttl = PressAbout::Application.config.sites_ttl
  @@import_slice_count = 10
  @@redis = PressAbout::Application.config.redis_server
  @@sites_parsing_queue_key = "press_about:production:sites_parsing_queue"

  cattr_reader :redis, :sites_parsing_queue_key, :tags_limit
  attr_accessor :saved_site_profile, :saved_site_rating
  attr_reader :new_tags
define_index "site_domain_name" do | |
indexes domain_name, :as => :domain_name | |
has created_at | |
has id, :as => :site_id | |
has "case is_english when -1 then 1 when 0 then 0 when 1 then 2 else 1 end", :as => :is_english, :type => :integer, :sort => true | |
has "case bad_rank when bad_rank < 0 then 0 when bad_rank > 0 then 2 else 1 end", :as => :bad_rank, :type => :integer, :sort => true | |
set_property :enable_star => 1 | |
set_property :min_infix_len => 3 | |
end | |
define_index "site_title" do | |
indexes title, :as => :title | |
has created_at | |
has id, :as => :site_id | |
has "case is_english when -1 then 1 when 0 then 0 when 1 then 2 else 1 end", :as => :is_english, :type => :integer, :sort => true | |
has "case bad_rank when bad_rank < 0 then 0 when bad_rank > 0 then 2 else 1 end", :as => :bad_rank, :type => :integer, :sort => true | |
where "title <> ''" | |
end | |
define_index "site_description" do | |
indexes description, :as => :description | |
has "case is_english when -1 then 1 when 0 then 0 when 1 then 2 else 1 end", :as => :is_english, :type => :integer, :sort => true | |
has "case bad_rank when bad_rank < 0 then 0 when bad_rank > 0 then 2 else 1 end", :as => :bad_rank, :type => :integer, :sort => true | |
has created_at | |
has id, :as => :site_id | |
where "title <> ''" | |
end | |
  @@site_types = {
      :startup => 0,
      :newsmaker => 1
  }

  @@parsing_statuses = {
      :new => 0,
      :parsed => 1,
      :error => 2,
      :empty => 3,
      :processing => 4,
      :not_found => 10
  }

  @@visibility_types = {
      :default => 0, # regular project
      :press_room => 1, # verified project; the press room is shown
      :hidden => 2 # hidden project
  }
  before_validation(:on => :create) do
    if attribute_present?("domain_name")
      domain_name = self.class.domain_from_url(self.domain_name)
      if domain_name.blank?
        errors.add(:domain_name, "not valid")
        # `return` inside this block would raise LocalJumpError; `next` sets
        # the block's value and halts the callback chain.
        next false
      else
        self.domain_name = domain_name
      end
      self.hash_domain = self.class.hash_from(domain_name)
      unless self.newsmaker?
        unless domain_exists?(domain_name, 15)
          errors.add(:domain_name, "not found")
          next false
        else
          answered_domain = answered_domain_name(domain_name)
          if answered_domain.blank?
            errors.add(:domain_name, "not answered")
            next false
          end
          if answered_domain != domain_name
            errors.add(:domain_name, "not found. Maybe '#{answered_domain}'?")
            next false
          end
        end
        next true
      end
    end
  end
  validates_each :new_tags do |record, attr, value|
    if value
      record.errors.add(:tags, "you can enter at most #{Site.tags_limit} tags") if value.size > Site.tags_limit
      record.errors.add(:tags, "each tag must be 3..50 characters long") if value.find { |s| !(3..50).include?(s.size) }
      record.errors.add(:tags, "a tag must be no more than 3 words and contain only letters and numbers") if value.find { |s| !Tag.tag_is_valid?(s) }
    end
  end
  def tags
    unless @tags
      site_tags = self.sites_tags.map { |s| s.tag_id }.to_a
      @tags = site_tags.blank? ? [] : Tag.where(:id => site_tags).to_a
    end
    @tags
  end

  def tags=(value)
    @new_tags = Tag.convert_string_to_tags(value.to_s, false)
  end
  def update_tags
    if @new_tags
      old_tags = self.tags.map { |t| t.value }
      if @new_tags.sort != old_tags.sort
        deleted_tags = old_tags - @new_tags
        new_tags = @new_tags - old_tags
        # Pass tag ids (not Tag objects) into the IN clause
        SitesTag.delete_all(["site_id = ? and tag_id in (?)", self.id, self.tags.select { |s| deleted_tags.include?(s.value) }.map { |s| s.id }]) unless deleted_tags.blank?
        self.add_tags(new_tags)
      end
    end
  end
  def clear_cache
    logger.debug('---clear_cache')
    self.clear_page_cache('startup')
    self.clear_page_cache('newsmaker')
    self.class.delete_site_from_cache(self.id)
    self.class.delete_limited_tags_from_cache(self.id)
  end
  def destroy_dependences
    # destroy startup dependencies
    self.site_profile.destroy if self.site_profile
    SiteRating.delete_all(["site_id = ?", self.id])
    SourceUrl.update_all("check_on_delete = 1", ["id in (?)", self.search_results.map { |source| source.source_url_id }])
    # destroy newsmaker dependencies
    SourceUrl.delete_all(["newsmaker_id = ?", self.id])
    SearchResult.delete_all(["site_id = ?", self.id])
    SearchResult.delete_all(["newsmaker_id = ?", self.id])
    # destroy tag dependencies
    SitesTag.delete_all(["site_id = ?", self.id])
    # Delete press releases via the owning press room before removing it;
    # the original keyed them on site_id, which left them orphaned.
    PressRelease.delete_all(["press_room_id = ?", self.press_room.id]) if self.press_room
    PressRoom.delete_all(["site_id = ?", self.id])
    SiteReview.delete_all(["site_id = ?", self.id])
    SitesUser.delete_all(["site_id = ?", self.id])
    LowQualitySite.delete_all(["site_id = ?", self.id])
    clear_cache
  end
  def generate_hashes
    if attribute_present?("domain_name")
      self.hash_domain = self.class.hash_from(self.domain_name)
    end
  end

  def domain_url
    "http://#{self.domain_name}"
  end

  def domain_name(force=false)
    if @domain_name.nil? or force == true
      @domain_name = read_attribute(:domain_name).to_s.downcase
      @domain_name = @domain_name[0..-2] if @domain_name.last =~ /\./ # strip a trailing dot
    end
    @domain_name
  end
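
  # Illustrative behavior of the reader above (hypothetical stored value):
  #   read_attribute(:domain_name) == "Example.COM." # => domain_name returns "example.com"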
  def title_safe
    unless @title_safe
      @title_safe = title.blank? ? domain_name : sanitize_string_for_view(title).capitalize_first
    end
    @title_safe
  end

  def description_safe
    unless @description_safe
      @description_safe = description.blank? ? description : sanitize_string_for_view(description[0..1000])[0..500].capitalize_first
    end
    @description_safe
  end

  def similar_sources(limit=2)
    similar_string = self.domain_name.split('.')[0..-2].join('.')
    similars = self.source_urls.each { |source| source.similarity = similar_string.similar(source.title) }
    # Honor the limit argument, which the original computed around but never applied
    similars.sort_by { |s| -s.similarity }.first(limit)
  end
  #def company
  #  self.press_room ? self.press_room.company : nil
  #end

  # Keywords
  def calculated_keywords
    self.class.calculated_keywords(self)
  end

  # Visibility type
  def set_visibility_type(type)
    value = @@visibility_types[type.to_sym]
    if value
      self.update_attribute(:visibility_type, value) unless self.visibility_type == value
    else
      errors.add(:visibility_type, "Visibility type not available")
      return false
    end
    self
  end

  def get_visibility_type
    @@visibility_types.key(self.visibility_type)
  end

  def visibility_hidden?
    self.visibility_type == self.class.visibility_type_by_name(:hidden)
  end

  def visibility_press_room?
    self.visibility_type == self.class.visibility_type_by_name(:press_room)
  end
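
  # Usage sketch (illustrative):
  #   site.set_visibility_type(:press_room) # persists visibility_type = 1
  #   site.visibility_press_room?           # => true
  #   site.get_visibility_type              # => :press_room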
  # Parsing status
  def set_parsing_status(status)
    value = @@parsing_statuses[status.to_sym]
    if value
      self.update_attribute(:status, value) unless self.status == value
    else
      errors.add(:status, "Status not available")
      return false
    end
    self
  end

  def get_parsing_status
    @@parsing_statuses.key(self.status)
  end

  def set_as_parsed!(status=:parsed)
    status_value = @@parsing_statuses[status.to_sym]
    if status_value
      self.class.update_all({:status => status_value, :last_parsed_at => Time.now}, ["id = ?", self.id])
      self.clear_cache
    end
  end

  # Parsing permissions
  def last_parsed
    self.last_parsed_at.blank? ? 'never' : self.last_parsed_at.strftime("%b. %d, %Y")
  end

  def allow_parsing?
    self.last_parsed_at.blank? ? true : (self.last_parsed_at < (Date.today - 1.day))
  end

  def next_parsing_time
    self.last_parsed_at.blank? ? Date.today : self.last_parsed_at + 1.day
  end

  # Favicon
  def favicon
    self.class.favicon_for_domain(self.domain_name)
  end
  # Last access
  def check_last_get_access
    if self.custom? and (self.last_get_access.blank? or self.last_get_access < Date.today)
      self.update_attribute(:last_get_access, Date.today)
    end
  end

  # Owner type
  def custom?
    self.owner_type == 1
  end

  def set_custom
    self.owner_type = 1 unless self.custom?
  end

  def set_custom!
    self.update_attribute(:owner_type, 1) unless self.custom?
  end

  # Ratings
  #def rating
  #  if self.saved_site_rating
  #    self.saved_site_rating.rating
  #  elsif self.rating
  #    self.site_rating.rating
  #  end
  #end
  def rating
    unless @rating
      if self.site_rating and self.site_rating.rating >= 0
        @rating = self.site_rating.rating
      else
        @rating = self.reset_rating
      end
    end
    @rating
  end

  def reset_rating
    rating = SiteRating.site_rating(self)
    SiteRating.delete_all(:site_id => self.id)
    SiteRating.create(:rating => rating, :site_id => self.id)
    rating
  end

  # Site type
  def get_site_type
    @@site_types.key(self.site_type)
  end

  def newsmaker?
    self.site_type == self.class.site_type_by_name(:newsmaker)
  end

  def startup?
    self.site_type == self.class.site_type_by_name(:startup)
  end

  def set_as_newsmaker
    self.site_type = self.class.site_type_by_name(:newsmaker)
  end

  def set_as_startup
    self.site_type = self.class.site_type_by_name(:startup)
  end

  def set_as_startup!
    self.update_attribute(:site_type, self.class.site_type_by_name(:startup)) unless startup?
  end

  def update_bad_rank!(force=false)
    if self.bad_rank == 0 or force == true
      bad_rank = text_is_bad?(self.bad_entity_sample) ? -1 : 1
      self.update_attribute(:bad_rank, bad_rank)
      SearchResult.update_all("site_bad_rank = #{bad_rank}", ["site_id = ?", self.id])
      SitesTag.update_all("site_bad_rank = #{bad_rank}", ["site_id = ?", self.id])
    end
  end
  # Languages
  def is_english?
    self.is_english == 1
  end

  def update_language!(force=false)
    self.update_attribute(:is_english, text_is_english?([self.title, self.description].join)) if self.is_english < 0 or force == true
  end

  def set_language
    self.is_english = text_is_english?([self.title, self.description].join) if !self.title.blank? or !self.description.blank?
  end

  # Cache
  def page_cache_key(type='startup')
    "page_#{type}_#{self.id}"
  end

  def clear_page_cache(type='startup')
    ActionController::Base.new.expire_fragment(page_cache_key(type))
  end

  # Tags
  def add_tags(tags=[])
    @limited_tags = nil
    self.class.add_tags_to_site(self, tags)
  end

  def correct_limited_tags(limit=5)
    unless @correct_tags
      site_tags = SitesTag.where(:site_id => self.id).limit(@@tags_limit * 2).select('tag_id').to_a
      @correct_tags = Tag.where(:id => site_tags.map { |s| s.tag_id }).select { |tag| Tag.tag_is_valid?(tag.value) }.sort_by { |tag| -tag.is_english }.map { |tag| tag.value }.to_a
      @correct_tags = @correct_tags.first(limit)
    end
    @correct_tags
  end

  def limited_tags(limit=3)
    return @limited_tags unless @limited_tags.nil?
    @limited_tags = self.id.blank? ? [] : self.class.limited_tags_by_site(self.id, limit)
  end

  # bad words
  def bad_entity_sample
    "#{self.domain_name} #{self.title_safe} #{self.description_safe} #{self.limited_tags.join(" ")}"
  end
  def is_good?
    self.bad_rank > 0
  end

  # Logo
  def logo
    if self.saved_site_profile
      self.saved_site_profile.logo
    elsif self.site_profile
      self.site_profile.logo
    end
  end

  def have_logo?(force=false)
    (self.saved_site_profile and self.saved_site_profile.logo_file_name.present?) or
        (self.site_profile(force) and self.site_profile.logo_file_name.present?)
  end

  def verified?
    @verified = self.sites_users.verified.size > 0 if @verified.nil?
    @verified
  end

  # Custom domain
  def custom_domain
    self.site_profile.present? ? self.site_profile.custom_domain : nil
  end

  # ---------------
  def to_param
    self.domain_name
  end
  def exclude_duplicate_results(sources=[])
    # Drop links that point back to this same domain
    results = sources.reject { |source| self.class.domain_from_url(source[:url]) == self.domain_name }
    duplicates = []
    # Compute hashes for the current list
    results = results.each { |source| source[:hash_url] = self.class.hash_from(source[:url]) }
    # Fetch the hashes of the previously stored links
    source_url_ids = SearchResult.where(:site_id => self.id).select("source_url_id").to_a
    unless source_url_ids.blank?
      # Materialize the relation so the in-place reject! below works on an Array
      old_source_urls = SourceUrl.where(:id => source_url_ids.map { |s| s.source_url_id }).select("id, hash_url").to_a
      old_source_urls_hashes = old_source_urls.map { |source| source.hash_url }
      # Split the matches out of the list
      duplicates, results = results.partition { |source| old_source_urls_hashes.include?(source[:hash_url]) }
      # Refresh the review date on the matches
      duplicates_hashes = duplicates.map { |source| source[:hash_url] }
      old_source_urls.reject! { |source| !duplicates_hashes.include?(source.hash_url) }
      SearchResult.update_all('last_reviewed_at = CURDATE()', ["site_id = ? and source_url_id in(?)", self.id, old_source_urls.map { |source| source.id }])
    end
    {:new => results, :old => duplicates}
  end
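
  # Usage sketch (illustrative): each source is a hash with at least :url.
  #   split = site.exclude_duplicate_results([{:url => "http://news.example/post"}])
  #   split[:new] # => sources not yet stored for this site
  #   split[:old] # => already-stored sources (their review date was refreshed)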
  def update_source_urls(sources=[], params={})
    result = {:updated => 0}
    return result if sources.blank?
    updated_sources = SourceUrl.initialize_source_urls(sources)
    source_urls = SourceUrl.where(:hash_url => updated_sources.map { |t| t.hash_url }).select("id, title, description, url, hash_url, is_english, updated_at").to_a
    search_results = self.search_results.where(:source_url_id => source_urls.map { |t| t.id }).to_a
    source_urls.each do |source_url|
      begin
        source = updated_sources.find { |t| t.hash_url == source_url.hash_url }
        search_result = search_results.find { |t| t.source_url_id == source_url.id }
        if source
          attr = {}
          attr[:title] = source.title if source.title.present? and source.title != source_url.title
          attr[:description] = source.description if source.description.present? and source.description != source_url.description
          is_english = SourceUrl.whats_language?(source)
          attr[:is_english] = is_english if is_english != source_url.is_english
          SourceUrl.update_all(attr, ["id = ?", source_url.id]) unless attr.blank?
        end
        if search_result
          weight_params = SearchResult.get_weight_params(self, source_url)
          weight_params = weight_params.merge(:weight => SearchResult.calculate_weight_by_params(weight_params))
          SearchResult.update_all(weight_params, ["site_id = ? and source_url_id = ?", self.id, search_result.source_url_id])
        end
        result[:updated] += 1
      rescue => e
        logger.debug("--update_source_urls error #{e}")
      end
    end
    result
  end
  def add_source_urls(sources=[], params={})
    result = {:added => 0, :deleted => 0}
    return result if sources.blank?
    new_sources = SourceUrl.initialize_source_urls(sources)
    old_sources = self.load_search_results(:with => [:source_urls, :newsmakers])
    # Build the list of all articles (url, hash_url, weight, created_at, hash_domain)
    results = []
    deleted_sources = []
    new_sources.each do |source|
      results << {
          :hash_url => source.hash_url,
          :weight => SearchResult.calculate_weight(self, source),
          :weight_factor => SearchResult.weight_factor(Date.today),
          :hash_domain => source.hash_domain
      }
    end
    old_sources.each do |source|
      if source.newsmaker and source.source_url
        results << {:hash_url => source.source_url.hash_url, :weight => source.weight,
                    :weight_factor => SearchResult.weight_factor(source.created_at.nil? ? Date.today : source.created_at),
                    :hash_domain => source.newsmaker.hash_domain}
      else
        deleted_sources.push(source)
      end
    end
    # Sort by weight times weight factor
    results = results.sort_by { |source| -source[:weight] * source[:weight_factor] }
    # Deduplicate by hash_domain
    results = results.uniq { |source| source[:hash_domain] }
    # Drop everything beyond the limit
    results = results[0..@@search_results_limit - 1]
    # List of records to keep
    results_hashes = results.map { |r| r[:hash_url] }
    new_sources = new_sources.reject { |source| !results_hashes.include?(source.hash_url) }
    # List of records to delete
    deleted_sources += old_sources.reject { |source| source.source_url.present? and results_hashes.include?(source.source_url.hash_url) }
    # Insert the new records into the DB (newsmakers -> source_urls -> search_results)
    new_source_urls = []
    newsmakers = self.class.create_newsmakers(new_sources)
    new_sources.each do |new_source_url|
      begin
        newsmaker = newsmakers.find { |newsmaker| newsmaker.hash_domain == new_source_url.hash_domain }
        if newsmaker
          new_source_url.newsmaker_id = newsmaker.id
        else
          newsmaker = Site.create(:domain_name => new_source_url.domain_name, :site_type => Site.site_type_by_name(:newsmaker))
          new_source_url.newsmaker_id = newsmaker.id if newsmaker.persisted?
        end
        new_source_urls << new_source_url
      rescue => e
        logger.error("Site.add_source_urls #{e.class}: #{e.message} | url = #{new_source_url.url}")
      end
    end
    SourceUrl.import(new_source_urls, :ignore => true, :validate => false, :synchronize => new_source_urls, :synchronize_keys => [:hash_url])
    result[:added] = new_source_urls.size
    Erasers::SourceUrlsEraser.remove_from_queue(new_source_urls.map { |source| source.id }, false)
    if params[:update_statuses] == true
      new_source_urls.each do |source_url|
        source_url.update_status(source_url.status) if source_url.status
      end
    end
    new_search_results = []
    new_source_urls.each do |source_url|
      new_search_results << SearchResult.initialize_by_site_and_source_url(self, source_url)
    end
    SearchResult.import(new_search_results, :validate => false)
    # Remove the dropped records from the DB
    unless deleted_sources.empty?
      result[:deleted] = deleted_sources.size
      SearchResult.delete_all(["site_id = ? and source_url_id in (?)", self.id, deleted_sources.map { |source| source.source_url_id }])
      # Queue the dropped source_urls for a deletion check
      #SourceUrl.update_all("check_on_delete = 1", ["id in (?)", deleted_sources.map { |source| source.source_url_id }])
      Erasers::SourceUrlsEraser.add_to_queue(deleted_sources.map { |source| source.source_url_id })
    end
    result
  end
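
  # Usage sketch (illustrative; `parsed_sources` is a hypothetical array of
  # source hashes accepted by SourceUrl.initialize_source_urls). The return
  # value reports the delta applied to the site's search results:
  #   stats = site.add_source_urls(parsed_sources, :update_statuses => true)
  #   stats # => {:added => 12, :deleted => 3}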
  def load_search_results(options={})
    search_results = self.search_results
    if options[:with]
      source_urls = []
      newsmakers = []
      if options[:with].include?(:source_urls)
        source_urls = SourceUrl.where(:id => search_results.map { |s| s.source_url_id }).select("id, hash_url, newsmaker_id")
      end
      if options[:with].include?(:newsmakers)
        newsmakers = Site.where(:id => search_results.map { |s| s.newsmaker_id }).select("id, hash_domain")
      end
      search_results.each do |search_result|
        search_result.source_url = source_urls.find { |s| s.id == search_result.source_url_id }
        search_result.newsmaker = newsmakers.find { |s| s.id == search_result.newsmaker_id }
      end
    end
    #self.search_results.includes(:source_url, :newsmaker)
    search_results
  end
  def add_to_searching(params={})
    uuid = "site_#{md5(self.domain_name)}"
    case params[:priority]
      when 'googlebot'
        self.class.redis.rpush(self.class.sites_parsing_queue_key, self.domain_name)
      when 'user'
        UserSearchParsingWorker.create(:domain_name => self.domain_name, :id => self.id, :uuid => "#{uuid}_user")
      when 'high'
        HighSearchParsingWorker.create(:domain_name => self.domain_name, :id => self.id, :uuid => "#{uuid}_high")
      else
        SearchParsingWorker.create(:domain_name => self.domain_name, :id => self.id, :uuid => uuid)
    end
  end
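
  # Usage sketch (illustrative): the priority selects the worker or queue.
  #   site.add_to_searching(:priority => 'user') # enqueues a UserSearchParsingWorker job
  #   site.add_to_searching                      # falls through to the default SearchParsingWorker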
  def set_url_status(status="")
    if status.blank?
      self.url_status.destroy if self.url_status
    else
      if self.url_status
        self.url_status.update_attributes(:status => status)
      else
        self.create_url_status(:status => status)
      end
    end
  end
  def as_json(options={})
    site = super
    site = site.merge(:site_profile => self.site_profile.as_json) if self.site_profile
    site
  end

  # API
  def json_profile
    {
        :domain_name => self.domain_name,
        :title => self.title_safe.to_s,
        :description => self.description_safe.to_s
    }
  end

  # Topics
  def latest_topics(limit=5, max_limit=50)
    unless @latest_topics
      @latest_topics = []
      articles_topics = ArticlesTopic.select("topic_id").where(:site_id => self.id).order("article_id DESC").limit(max_limit).to_a
      if articles_topics.present?
        articles_topics = articles_topics.map { |a| a.topic_id }.uniq.first(limit)
        topics = Topic.select("id, value").where(:id => articles_topics).to_a
        articles_topics.each { |a| @latest_topics.push(topics.find { |t| t.id == a }) }
        @latest_topics.compact!
      end
    end
    @latest_topics
  end
  # Featured at
  def featured_at(limit=6)
    results = []
    search_results = self.search_results.select("newsmaker_id, rating, logo_file_name").
        joins("left join site_ratings on newsmaker_id = site_ratings.site_id").
        joins("left join site_profiles on newsmaker_id = site_profiles.site_id").to_a
    search_results.select! { |s| s.rating.present? and s.logo_file_name.present? and s.rating >= FEATURED_AT_POPULARITY }
    search_results.sort! { |a, b| b.rating <=> a.rating }
    search_results = search_results.first(limit)
    if search_results.present?
      sites = Site.where(:id => search_results.map(&:newsmaker_id)).to_a
      search_results.each do |search_result|
        site = sites.find { |s| s.id == search_result.newsmaker_id }
        unless site.blank?
          site.saved_site_profile = SiteProfile.new(:logo_file_name => search_result.logo_file_name)
          site.saved_site_profile.site_id = search_result.newsmaker_id
          results << site
        end
      end
    end
    results
  end
  private

  def prepare_source_urls(sources=[])
    sources.each do |source|
      source[:hash_url] = self.class.hash_from(source[:url])
    end
  end

  def default_values
    unless self.persisted?
      self.title ||= ''
      self.description ||= ''
      self.last_parsed_at ||= false
      self.related_parsed_at ||= false
      self.last_get_access ||= false
      self.created_at ||= false
      self.updated_at ||= false
      self.set_language
    end
  rescue ActiveModel::MissingAttributeError
  end
  class << self
    include Utils

    def default_domain_name
      "stickr.com"
    end

    # Favicon
    def favicon_for_domain(domain_name)
      #"http://faviget.appspot.com/http://#{domain_name}"
      "http://www.google.com/s2/favicons?domain_url=http://#{domain_name}"
    end

    # Tags
    def add_tags_to_site(site, tags=[])
      return if tags.empty?
      # create any new tags
      tags = Tag.create_tags(tags)
      # link the tags to the site
      new_site_tags = []
      tags.each do |tag|
        new_site_tags << SitesTag.new(:site_id => site.id, :tag_id => tag.id)
      end
      SitesTag.import(new_site_tags, :validate => false, :ignore => true, :slice_by => @@import_slice_count)
      # flag that tags were added to the site
      site
    end
    def tags_cache_key(site_id)
      "site_tags:#{site_id}"
    end

    def delete_limited_tags_from_cache(site_id)
      Rails.cache.delete(tags_cache_key(site_id))
    end

    def limited_tags_by_site(site_id, limit=3)
      tags = Rails.cache.fetch(tags_cache_key(site_id), :expires_in => @@tags_ttl) do
        site_tags = SitesTag.where(:site_id => site_id).limit(@@tags_limit).select('tag_id').to_a
        site_tags.blank? ? [] : Tag.where(:id => site_tags.map { |s| s.tag_id }).sort_by { |tag| -tag.is_english }.map { |tag| tag.value }.to_a
      end
      tags.blank? ? [] : tags.sort.first(limit)
    end

    # Cache
    def site_cache_key(site_id)
      "site:#{site_id}"
    end

    def delete_site_from_cache(site_id)
      Rails.cache.delete(site_cache_key(site_id))
    end

    def read_site_from_cache(site_id)
      Rails.cache.read(site_cache_key(site_id), :raw => true)
    end

    def write_site_to_cache(site)
      begin
        raise "site blank" unless site
        dump = dump(site)
        Rails.cache.write(site_cache_key(site.id), dump, :expires_in => @@site_ttl, :raw => true)
      rescue => e
        logger.debug("--#{e}")
      end
    end
    def load_site_by_id(site_id)
      dump = read_site_from_cache(site_id)
      if dump
        # Deserialize the cached JSON (the original returned the raw string here)
        site = load(dump)
      else
        site = find(site_id)
        write_site_to_cache(site)
      end
      site
    end

    def dump(site)
      ActiveSupport::JSON.encode(site)
    end

    def load(dump)
      decode = ActiveSupport::JSON.decode(dump)
      site = new(decode.except('id', 'site_profile'))
      site.id = decode['id']
      if decode['site_profile'].present?
        site.saved_site_profile = SiteProfile.new(decode['site_profile'].except('site_id'))
        site.saved_site_profile.site_id = decode['site_profile']['site_id']
      end
      site
    end
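
    # Round-trip sketch (illustrative): dump/load carry a Site through the cache.
    #   json = Site.dump(site) # => JSON string
    #   copy = Site.load(json) # => unsaved Site with the original id restored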
    def load_sites_by_id(site_ids=[], allow_caching=true, params={})
      ids = site_ids.is_a?(Array) ? site_ids : [site_ids]
      sites = []
      time_interval("---load_sites_by_id") do
        dumps = []
        if allow_caching == true
          begin
            ids.each { |id| dumps.push(read_site_from_cache(id)) }
            dumps.each do |dump|
              sites.push(load(dump)) if dump
            end
          rescue => e
            logger.debug(e)
            ids.each { |id| delete_site_from_cache(id) }
          end
          sites = sites.compact
        end
        cached_ids = sites.map { |site| site['id'] }
        not_cached_ids = ids.reject { |id| cached_ids.include?(id) }
        unless not_cached_ids.empty?
          not_cached_sites = where("id in (?)", not_cached_ids).includes(:site_profile).to_a
          not_cached_sites.each { |site| write_site_to_cache(site) } if allow_caching == true
          sites += not_cached_sites
          sites.each { |s| s.limited_tags }
        end
      end
      sites
    end
    # Language
    def is_english?(site)
      site['is_english'] == 1
    end

    def whats_language?(site)
      if !site['title'].blank? or !site['description'].blank?
        text_is_english?([site['title'], site['description']].join) ? 1 : 0
      else
        -1
      end
    end

    # Keywords
    def calculated_keywords(site)
      search_words_composition(site['domain_name'], index_keywords_from_text([site['title'], site['description']].join(' ')))
    end

    # Site types
    def site_type_by_name(key)
      @@site_types[key.to_sym]
    end

    # Visibility type
    def visibility_types
      @@visibility_types
    end

    def list_visibility_types
      @@visibility_types.except(:default)
    end

    def visibility_type_by_name(name)
      @@visibility_types[name.to_sym]
    end

    # Parsing status
    def parsing_status_by_name(status_name)
      @@parsing_statuses[status_name.to_sym]
    end

    def hash_from(source)
      Digest::MD5.hexdigest(source.strip) unless source.blank?
    end
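
    # Illustrative (values hypothetical): hashes are plain MD5 hex digests
    # of the trimmed input, and blank input yields nil.
    #   Site.hash_from(" example.com ") # == Digest::MD5.hexdigest("example.com")
    #   Site.hash_from("")              # => nil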
    def add_group_sites_to_parsing(ids=[])
      unless ids.blank?
        uuid = "meta_parsing_#{hash_from(ids.join(""))}"
        SitesMetaParsingWorker.create(:ids => ids, :uuid => uuid)
      end
    end

    def add_sites_to_searching
      counter = redis.llen(sites_parsing_queue_key).to_i
      # Pop exactly `counter` entries; the original `(0..counter)` iterated one time too many
      counter.times do
        domain_name = redis.lpop sites_parsing_queue_key
        unless domain_name.blank?
          site = Site.new(:domain_name => domain_name)
          site.add_to_searching(:priority => 'high')
        end
      end
      counter
    end
    def create_newsmakers(newsmakers=[], params={})
      newsmakers_hash_domains = newsmakers.map { |newsmaker| hash_from(newsmaker.respond_to?(:domain_name) ? newsmaker.domain_name : newsmaker[:domain_name]) }
      old_newsmakers = self.where(:hash_domain => newsmakers_hash_domains).select("id, domain_name, hash_domain")
      old_newsmakers_hash_domains = old_newsmakers.map { |newsmaker| newsmaker.hash_domain }
      new_newsmakers = newsmakers.reject { |newsmaker| old_newsmakers_hash_domains.include?(hash_from(newsmaker.respond_to?(:domain_name) ? newsmaker.domain_name : newsmaker[:domain_name])) }
      created_newsmakers = []
      new_newsmakers.each do |newsmaker|
        begin
          site = Site.new(:domain_name => newsmaker.respond_to?(:domain_name) ? newsmaker.domain_name : newsmaker[:domain_name], :site_type => site_type_by_name(:newsmaker))
          created_newsmakers << site if site and !site.domain_name.blank?
        rescue => error
          logger.debug("Site.create_newsmakers error = #{error.message}")
        end
      end
      # TODO: add a slice_by implementation
      if params[:synchronize] == false
        import(created_newsmakers, :validate => false)
      else
        import(created_newsmakers, :validate => false, :synchronize => created_newsmakers, :synchronize_keys => [:hash_domain])
        SiteMetaParser.add_sites(created_newsmakers)
      end
      # Return the full set either way (the original returned the import result
      # in the non-synchronized branch)
      old_newsmakers + created_newsmakers
    end
  end
end