class ScraperManager:
    def __init__(self):
        self.scraper = ThisIsTheScraperYouWroteWithScrapy()

    def start_session(self):
        # Loop over all elements that Scrapy finds.
        for files, metadata in self.scraper.run():
            # `files`: a list of URLs of the actual files to download for the current paper, like:
            #     ['http://www.narcis.com/paper1-part1.pdf', 'http://www.narcis.com/paper1-part2.pdf']
            # `metadata`: a dictionary of metadata for the current paper, like:
            #     {'title': 'My title1', 'author': 'John Doe'}
            #
            # What will I do with `files` and `metadata`?
            # This print is a placeholder, of course. In another part of the code I will create a
            # model for the actual downloaded files and a model for the metadata. Here I will
            # instantiate such models and save `files` and `metadata` in them. The model designed
            # to store `files` will take care of downloading the actual files and storing them in
            # the right storage folder (which depends on the ID assigned by the database).
            # Note: I saw you made Scrapy download the paper; I think it's better to change this
            # and make Scrapy return only the URLs. The model will then be in charge of
            # downloading and storing the files.
            print('%s - %s' % (files, metadata))


class ThisIsTheScraperYouWroteWithScrapy:
    """
    This is fake code that mimics the code you have already written with Scrapy.
    You just have to edit your code so that it yields 2 things every time it scrapes a new paper:
    - a list of URLs of the actual files to download (usually 1, but there can be more)
    - a dictionary of metadata
    The code here is an example to show you how to use `yield`. Think of `yield` as a
    `return something` that, when the caller asks for the next item, comes back and
    continues from the next line.
    """
    def run(self):
        yield ['http://www.narcis.com/paper1-part1.pdf', 'http://www.narcis.com/paper1-part2.pdf'],\
            {'title': 'My title1', 'author': 'John Doe'}
        yield ['http://www.narcis.com/paper2.pdf'], {'title': 'My title2', 'author': 'Jane Doe'}
        yield ['http://www.narcis.com/paper3.pdf'], {'title': 'My title3', 'author': 'Johannes Doe'}
        yield ['http://www.narcis.com/paper4-part1.pdf', 'http://www.narcis.com/paper4-part2.pdf'],\
            {'title': 'My title4', 'author': 'John Doe'}


if __name__ == '__main__':
    manager = ScraperManager()
    manager.start_session()
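
To make the comment in `start_session` more concrete, here is a minimal sketch of the two models it mentions. Everything in it is hypothetical: the names `Paper` and `PaperFile`, the fake in-memory `save()`, and the `storage` folder; in the real code these would be proper database models. The split of responsibilities is the point: the manager saves the metadata, and the file model downloads and stores its own file.

import os
import urllib.request


class Paper:
    """Hypothetical metadata model: gets an ID on save, like a database row would."""
    _next_id = 1

    def __init__(self, metadata):
        self.metadata = metadata
        self.id = None

    def save(self):
        self.id = Paper._next_id
        Paper._next_id += 1
        return self


class PaperFile:
    """Hypothetical file model: downloads its URL into a folder named after the paper ID."""
    def __init__(self, paper, url):
        self.paper = paper
        self.url = url

    def download(self, root='storage'):
        folder = os.path.join(root, str(self.paper.id))
        os.makedirs(folder, exist_ok=True)
        path = os.path.join(folder, os.path.basename(self.url))
        urllib.request.urlretrieve(self.url, path)  # fetch the actual file
        return path

With these, the fake print in `start_session` would become something like `paper = Paper(metadata).save()` followed by `PaperFile(paper, url).download()` for each URL in `files`.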
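
And since the docstring above leans on `yield`, a tiny standalone demo of how a generator pauses and resumes (pure Python, nothing assumed):

def count_up():
    yield 1  # the first next() returns here and the function pauses
    yield 2  # the next call resumes on this line
    yield 3

gen = count_up()
print(next(gen))   # 1
print(next(gen))   # 2 - execution continued right after `yield 1`
print(list(gen))   # [3] - whatever is left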
class ScraperManager:
    ...

    def start_session(self):
        # Loop over all elements that Scrapy finds.
        for files, metadata in self.scraper.run():
            # - `files`: a list of URLs of the actual files to download for the current paper, like:
            #       ['http://www.narcis.com/paper1-part1.pdf', 'http://www.narcis.com/paper1-part2.pdf']
            # - `metadata`: a dictionary of metadata for the current paper, like:
            #       {'title': 'My title1', 'author': 'John Doe'}
            # - `self.scraper.run()` is a call to the scraper that you have written. You just
            #   have to edit your code so that it *yields* 2 things every time it scrapes a new
            #   paper: `files` and `metadata`.
            #
            # What will I do with `files` and `metadata`?
            # This print is a placeholder, of course. In another part of the code I will create a
            # model for the actual downloaded files and a model for the metadata. Here I will
            # instantiate such models and save `files` and `metadata` in them. The model designed
            # to store `files` will take care of downloading the actual files and storing them in
            # the right storage folder (which depends on the ID assigned by the database).
            # Note: I saw you made Scrapy download the paper; I think it's better to change this
            # and make Scrapy return only the URLs. The model will then be in charge of
            # downloading and storing the files.
            print('%s - %s' % (files, metadata))


if __name__ == '__main__':
    manager = ScraperManager()
    manager.start_session()
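
One caveat about plugging real Scrapy into this: a Scrapy spider is driven by the crawler engine, not iterated like a plain generator, so `run()` above is a simplification. A sketch of the spider side, assuming each listing page has CSS-selectable PDF links and title/author fields (the URL and all selectors here are made up):

import scrapy


class NarcisSpider(scrapy.Spider):
    """Hypothetical spider: the start URL and CSS selectors are placeholders."""
    name = 'narcis'
    start_urls = ['http://www.narcis.com/papers']

    def parse(self, response):
        for paper in response.css('div.paper'):
            # One dict per paper: the same (files, metadata) pair as in the
            # fake scraper, just packed into a single item.
            yield {
                'files': [response.urljoin(href)
                          for href in paper.css('a.pdf::attr(href)').getall()],
                'metadata': {
                    'title': paper.css('h2::text').get(),
                    'author': paper.css('span.author::text').get(),
                },
            }

Each yielded item then flows through Scrapy's item pipelines, so the `ScraperManager` logic (save the metadata, let the file model download) would live in a pipeline's `process_item` method rather than in a `for` loop over `run()`.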