Skip to content

Instantly share code, notes, and snippets.

View rafikahmed's full-sized avatar

Ahmed Rafik Djerah rafikahmed

View GitHub Profile
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<title>Xpath Syntax</title>
</head>
@rafikahmed
rafikahmed / index.html
Created September 27, 2018 17:54
HTML Document
<div>
  <!-- An href without a scheme is resolved as a relative path, so the full URL is spelled out. -->
  <a href='https://www.example.com'>Link</a>
</div>
@rafikahmed
rafikahmed / index.html
Created September 27, 2018 18:22
XPath
<!-- Sample paragraphs: the first carries a class attribute, the second an id attribute. -->
<p class='someClass'>Paragraph 1</p>
<p id='someId'>Paragraph 2</p>
@rafikahmed
rafikahmed / jokes.py
Last active September 27, 2018 19:00
import scrapy
class JokesSpider(scrapy.Spider):
name= 'jokes'
allowed_domains = ['www.laughfactory.com']
start_urls = [
'http://www.laughfactory.com/jokes/family-jokes'
]
def parse(self, response):
@rafikahmed
rafikahmed / jokes.py
Last active September 29, 2018 10:01
def parse(self, response):
    """Yield one item per joke on the page, then follow the pagination link.

    Args:
        response: The downloaded listing page. It must provide ``xpath()``
            and ``urljoin()``, as a Scrapy response does.

    Yields:
        A ``{'joke_text': ...}`` dict for every ``div`` whose class is
        ``jokes``, followed by a ``scrapy.Request`` for the next page when
        a "next" link is present on the page.
    """
    for joke in response.xpath("//div[@class='jokes']"):
        yield {
            # The XPath selects the <p> element itself, so the value is the
            # first matching paragraph as extracted (None when nothing matches).
            'joke_text': joke.xpath(".//div[@class='joke-text']/p").extract_first()
        }

    # Pagination is evaluated once per page, after every joke was emitted.
    next_page = response.xpath("//li[@class='next']/a/@href").extract_first()
    if next_page is not None:
        # Resolve a possibly relative href against the current page URL.
        next_page_link = response.urljoin(next_page)
        yield scrapy.Request(url=next_page_link, callback=self.parse)
class JokeItem(scrapy.Item):
    """Scrapy item with a single field, ``joke_text``."""

    # Declared as a plain Field with no metadata or processors attached.
    joke_text = scrapy.Field()
import scrapy
from demo_project.items import JokeItem
from scrapy.loader import ItemLoader
class JokesSpider(scrapy.Spider):
name= 'jokes'
# Must be spelled "allowed_domains": Scrapy ignores unknown attribute names,
# so a misspelling silently disables offsite request filtering.
allowed_domains = ['www.laughfactory.com']
start_urls = [
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags
def remove_whitespace(value):
    """Return ``value`` with leading and trailing whitespace stripped.

    Args:
        value: A string (any object exposing ``strip()``).

    Returns:
        The result of ``value.strip()``; inner whitespace is left untouched.
    """
    return value.strip()
class JokeItem(scrapy.Item):
joke_text= scrapy.Field(
input_processor= MapCompose(remove_tags, remove_whitespace),
__pycache__/
.vscode/
build/
dbs/
eggs/
project.egg-info/
*.json
*.csv