Created
April 30, 2017 04:51
-
-
Save botmtl/5fe7bb029b5ed12cdb5e0f297646ec8e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Copyright 2013 Fanficdownloader team, 2017 FanFicFare team | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# | |
import time | |
import logging | |
logger = logging.getLogger(__name__) | |
import re | |
import urllib2 | |
import urlparse | |
import time | |
import os | |
from bs4.element import Comment | |
from ..htmlcleanup import stripHTML | |
from .. import exceptions as exceptions | |
import sys | |
from base_adapter import BaseSiteAdapter, makeDate | |
def getClass():
    """Entry point used by the adapter registry: return this module's adapter class."""
    return NiftyOrgAdapter
class NiftyOrgAdapter(BaseSiteAdapter):
    """Adapter for stories hosted at www.nifty.org.

    Story URLs look like:
        http://www.nifty.org/nifty/<genre>/<subgenre>/<story-title>/

    The site exposes almost no structured metadata, so most fields
    (title, categories, story ID) are derived from the URL path itself.
    """

    def __init__(self, config, url):
        BaseSiteAdapter.__init__(self, config, url)
        self.story.setMetadata('siteabbrev', 'nifty')

        # The whole URL path doubles as the story ID,
        # e.g. /nifty/gay/college/some-story/
        self.story.setMetadata('storyId', self.parsedUrl.path)

        # Splitting '/nifty/<genre>/<subgenre>/<title>/' on '/' yields
        # ['', 'nifty', <genre>, <subgenre>, <title>, ...] -- split once
        # instead of three times as the original did.
        parts = self.parsedUrl.path.split('/')
        self.story.addToList('category', parts[2])
        self.story.addToList('category', parts[3])

        # Titles are slugified in the URL; un-slug and title-case them.
        self.story.setMetadata('title', parts[4].replace('-', ' ').title())

        # The site has no author pages; use fixed placeholders.
        self.story.setMetadata('authorUrl', 'http://www.nifty.org')
        self.story.setMetadata('author', 'Nifty Writer')
        self.story.setMetadata('authorId', 1)

        ## set url
        self._setURL(url)

    @staticmethod
    def getSiteDomain():
        return 'www.nifty.org'

    @classmethod
    def getAcceptDomains(cls):
        # FIX: the second entry was 'http://www.asstr.org' -- a full URL in
        # a list of bare domains; normalized to the domain form.
        return ['www.nifty.org', 'www.asstr.org']

    @classmethod
    def getSiteExampleURLs(cls):
        return "http://www.nifty.org/nifty/Genre/SubGenre/StoryTitle/ https://www.nifty.org/nifty/Genre/SubGenre/StoryTitle/"

    def getSiteURLPattern(self):
        # FIX: the original pattern contained a stray '\w' before 'ww'
        # (r"\/\www\.") and left the dots in 'nifty.org' unescaped; this
        # version matches the same URLs intentionally rather than by accident.
        return r"https?://www\.nifty\.org/nifty/([a-zA-Z0-9_-]+)/([a-zA-Z0-9_-]+)/(.*)"

    def extractChapterUrlsAndMetadata(self):
        """
        Chapters are located at /StoryName/ (for single-chapter
        stories), or //StoryName/index#.html for multiple chapters (# is a
        non-padded incrementing number, like StoryName1, StoryName2.html, ...,
        StoryName10.html)

        This site doesn't have much in the way of metadata, except on the
        Category and Tags index pages, so we get what we can.

        Also, as this is an Adult site, the is_adult check is mandatory.
        """
        if not (self.is_adult or self.getConfig("is_adult")):
            raise exceptions.AdultCheckRequired(self.url)

        try:
            data1 = self._fetchUrl(self.url)
            soup1 = self.make_soup(data1)
        except urllib2.HTTPError as e:  # FIX: 'as' syntax; bare raise keeps traceback
            if e.code == 404:
                raise exceptions.StoryDoesNotExist(self.url)
            else:
                raise

        if 'Page Not Found.' in data1:
            raise exceptions.StoryDoesNotExist(self.url)

        # Get chapter URLs.  FIX: select() returns a (possibly empty) list,
        # never None, so the original 'is not None' test was always true and
        # made the single-chapter branch unreachable; test truthiness instead.
        self.chapterUrls = []
        chapterTable = soup1.select('td > a')
        if chapterTable:
            # Multi-chapter story: links are listed newest-first, so insert
            # at the front to restore chronological order.
            for page in chapterTable:
                chapterUrl = urlparse.urljoin(self.url, page['href'])
                chapterTitle = chapterUrl.replace('-', ' ').title()
                self.chapterUrls.insert(0, (chapterTitle, chapterUrl))
        else:
            # Single-chapter story: the story page itself is the chapter.
            # FIX: entries must be (title, url) tuples like the branch above;
            # the original stored a bare URL string.
            self.chapterUrls = [(self.story.getMetadata('title'), self.url)]
            # FIX: the original called the nonexistent self.setStoryMetadata().
            self.story.setMetadata('author', 'foundintext')

        self.story.setMetadata('numChapters', len(self.chapterUrls))
        logger.debug("Story: <%s>", self.story)

    def getChapterText(self, url):
        """Fetch one chapter page and return its text as cleaned-up HTML."""
        logger.debug('Getting chapter text from <%s>' % url)
        data1 = self._fetchUrl(url)
        # NOTE: the original extracted 'From:' as the author here but never
        # used it (all uses were commented out), and the unguarded .group(1)
        # crashed on pages without a 'From:' line -- removed as dead code.

        # Stories are plain text: mark paragraph breaks (blank lines) first,
        # then collapse remaining single line breaks into spaces.
        storycontent = ('<div>'
                        + re.sub('<br>', ' ',
                                 '<p>' + re.sub(r'<br>\s*<br>', '</p><p>',
                                                data1.replace('\n', '<br>')))
                        + '</div>')
        # FIX: the original called .decode() here, handing utf8FromSoup a
        # plain string instead of a soup object; utf8FromSoup operates on
        # soup trees -- TODO(review): confirm against the base class.
        story1 = self.make_soup(storycontent)
        return self.utf8FromSoup(url, story1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment