@botmtl
Created April 30, 2017 04:51
# -*- coding: utf-8 -*-
# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
import urllib2
import urlparse

from bs4.element import Comment

from ..htmlcleanup import stripHTML
from .. import exceptions as exceptions
from base_adapter import BaseSiteAdapter, makeDate

logger = logging.getLogger(__name__)


def getClass():
    return NiftyOrgAdapter


class NiftyOrgAdapter(BaseSiteAdapter):

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)

        self.story.setMetadata('siteabbrev', 'nifty')

        # The story ID is the full URL path,
        # e.g. /nifty/Genre/SubGenre/StoryTitle/
        storyId = self.parsedUrl.path
        self.story.setMetadata('storyId', storyId)

        # Path segments 2 and 3 are the genre and sub-genre.
        self.story.addToList('category', self.parsedUrl.path.split('/')[2])
        self.story.addToList('category', self.parsedUrl.path.split('/')[3])

        # Segment 4 is the hyphenated story title.
        title = self.parsedUrl.path.split('/')[4].replace('-', ' ').title()
        self.story.setMetadata('title', title)

        # nifty.org has no author pages, so use site-wide placeholders.
        self.story.setMetadata('authorUrl', 'http://www.nifty.org')
        self.story.setMetadata('author', 'Nifty Writer')
        self.story.setMetadata('authorId', 1)

        # set url
        self._setURL(url)
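    # Illustrative mapping (hypothetical URL, for documentation only):
    # http://www.nifty.org/nifty/gay/highschool/my-first-day/ has path
    # '/nifty/gay/highschool/my-first-day/', which splits on '/' into
    # ['', 'nifty', 'gay', 'highschool', 'my-first-day', ''], giving
    # categories 'gay' and 'highschool' and the title 'My First Day'.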
    @staticmethod
    def getSiteDomain():
        return 'www.nifty.org'

    @classmethod
    def getAcceptDomains(cls):
        # Bare domains only, no scheme.
        return ['www.nifty.org', 'www.asstr.org']

    @classmethod
    def getSiteExampleURLs(cls):
        return ("http://www.nifty.org/nifty/Genre/SubGenre/StoryTitle/ "
                "https://www.nifty.org/nifty/Genre/SubGenre/StoryTitle/")

    def getSiteURLPattern(self):
        return r"https?://www\.nifty\.org/nifty/([a-zA-Z0-9_-]+)/([a-zA-Z0-9_-]+)/(.*)"
    def extractChapterUrlsAndMetadata(self):
        """
        Chapters are located at /StoryName/ (for single-chapter
        stories), or at /StoryName/index#.html for multi-chapter
        stories, where # is a non-padded incrementing number
        (StoryName1.html, StoryName2.html, ..., StoryName10.html).

        This site doesn't have much in the way of metadata, except on
        the Category and Tags index pages, so we get what we can.

        Also, as this is an adult site, the is_adult check is mandatory.
        """
        if not (self.is_adult or self.getConfig("is_adult")):
            raise exceptions.AdultCheckRequired(self.url)

        try:
            data1 = self._fetchUrl(self.url)
            soup1 = self.make_soup(data1)
            # strip comments from soup
            # [comment.extract() for comment in soup1.find_all(text=lambda text: isinstance(text, Comment))]
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise

        if 'Page Not Found.' in data1:
            raise exceptions.StoryDoesNotExist(self.url)

        # Get chapter URLs. select() returns a list, so test for
        # emptiness rather than None.
        self.chapterUrls = []
        chapterTable = soup1.select('td > a')
        if chapterTable:
            # Multi-chapter story: the index lists parts newest first,
            # so insert at the front to restore chronological order.
            # The chapter title is taken from the link text.
            for page in chapterTable:
                chapterUrl = urlparse.urljoin(self.url, page['href'])
                chapterTitle = stripHTML(page).replace('-', ' ').title()
                self.chapterUrls.insert(0, (chapterTitle, chapterUrl))
        else:
            # Single-chapter story: the story URL is the only chapter.
            self.chapterUrls = [(self.story.getMetadata('title'), self.url)]

        # Placeholder; the real author is parsed from the chapter text.
        self.story.setMetadata('author', 'foundintext')
        self.story.setMetadata('numChapters', len(self.chapterUrls))
        logger.debug("Story: <%s>", self.story)
        return
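    # Illustrative example (assumed index markup, not verified against
    # the live site): an index listing
    #
    #   <td><a href="story2.html">Part 2</a></td>
    #   <td><a href="story1.html">Part 1</a></td>
    #
    # (newest first) produces chapterUrls in reading order:
    #   [('Part 1', '.../story1.html'), ('Part 2', '.../story2.html')]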
    def getChapterText(self, url):
        logger.debug('Getting chapter text from <%s>' % url)

        data1 = self._fetchUrl(url)

        # Nifty chapters are plain text with a mail-style header; the
        # author name appears on the "From:" line. Guard against
        # chapters that lack the header.
        authormatch = re.search(r'From:(.*?)\n', data1)
        if authormatch:
            storyauthor = authormatch.group(1).strip()
            # self.story.setMetadata('author', storyauthor)
            # self.story.setMetadata('authorid', url)

        # Convert the plain text to HTML: blank lines become paragraph
        # breaks, remaining single line breaks collapse to spaces.
        storycontent = ('<div><p>'
                        + re.sub(r'<br>\s*<br>', '</p><p>',
                                 data1.replace('\n', '<br>'))
                        + '</p></div>')
        storycontent = storycontent.replace('<br>', ' ')
        story1 = self.make_soup(storycontent)

        # get story text
        return self.utf8FromSoup(url, story1)
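    # Illustrative walk-through of the conversion above, on a
    # hypothetical input:
    #
    #   'Line one\nline two\n\nPara two\n'
    #   -> '<div><p>Line one line two</p><p>Para two </p></div>'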