Example 1: scrapy itemloader example
def parse_question(self, response):
    """Extract a question item from a question page and request its answers.

    The numeric question id is taken from ``response.url``. Pages whose
    text contains ``"QuestionHeader-title"`` are scraped with the
    ``QuestionHeader-*`` CSS selectors; every other page falls back to the
    ``#zh-question-*`` selectors.

    Yields:
        A ``scrapy.Request`` for the first page of answers (handled by
        ``self.parse_answer``), followed by the loaded question item.
        Nothing is yielded when the URL does not contain a question id.
    """
    # Match once, up front. Previously each branch repeated the regex and a
    # failed match left ``question_id`` unbound, which raised NameError at
    # the final yield.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)

    if "QuestionHeader-title" in response.text:
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    else:
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()",
        )
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()",
        )
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

    question_item = item_loader.load_item()

    # presumably 20 and 0 are the page size and starting offset expected by
    # the start_answer_url template -- confirm against its definition.
    yield scrapy.Request(
        self.start_answer_url.format(question_id, 20, 0),
        headers=self.headers,
        callback=self.parse_answer,
    )
    yield question_item
Example 2: scrapy itemloader example
def parse_item(self, response):
    """Load a GaokaopaiZhiyeItem from the page and request its related majors.

    Collects the page URL, the code captured from the URL by
    ``-([^-]+)\\.html``, the title text, one dict per ``.catType>a`` link
    (``url``, ``code``, ``name``) and the ``.zhiyeShow`` detail block.

    Returns:
        A ``FormRequest`` to the related-majors AJAX endpoint, posting the
        first extracted code and carrying the loaded item in
        ``meta['item']`` for ``self.parse_majors``.
    """
    # NOTE(review): in Scrapy's documented signature the second positional
    # parameter of ItemLoader is ``selector``, not ``response`` -- confirm
    # whether ``response=response`` was intended. Left unchanged here.
    loader = ItemLoader(GaokaopaiZhiyeItem(), response)
    loader.add_value('url', response.url)
    # ``ur'...'`` literals are a SyntaxError on Python 3; a plain raw string
    # produces the same pattern.
    loader.add_value('code', response.url, re=r'-([^-]+)\.html')
    loader.add_css('name', '.modTitle>h1::text')

    categories = []
    for anchor in response.css('.catType>a'):
        # Evaluate the href selector once and reuse it for both fields.
        href = anchor.css('::attr(href)')
        categories.append({
            'url': href.extract_first(),
            'code': href.re_first(r'-([^-]+)\.html'),
            'name': anchor.css('::text').extract_first(),
        })
    loader.add_value('category', categories)

    loader.add_css('detail', '.zhiyeShow')
    item = loader.load_item()

    # assumes the URL matched the code pattern so item['code'] holds at
    # least one value -- TODO confirm behaviour for non-matching URLs.
    return FormRequest(
        url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
        formdata={'code': item['code'][0]},
        meta={'item': item},
        dont_filter=True,
        callback=self.parse_majors,
    )
Example 3: scrapy itemloader example
def parse(self, response):
    """Load a PlantItem from the banner and post-meta sections of the page.

    Each field is filled from a single absolute XPath rooted at
    ``div#bodycontent``; the populated item is returned.
    """
    # (field name, XPath) pairs, applied in this order.
    field_xpaths = (
        ('name', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/h2/text()"),
        ('species', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/div[@class='clear resultSpecies']/text()"),
        ('key', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-key']/text()"),
        ('value', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-value']/child::node()"),
    )

    plant_loader = ItemLoader(item=PlantItem(), response=response)
    for field_name, xpath in field_xpaths:
        plant_loader.add_xpath(field_name, xpath)
    return plant_loader.load_item()
Example 4: scrapy itemloader example
def parse_item(self, response):
    """
    Extract fields from the individual email page and load them into the
    item.

    The page is handled by ``self.parse_new_system`` when its first
    top-level comment mentions "MHonArc", otherwise by
    ``self.parse_old_system``; the choice also decides which pair of HTML
    comments delimits the message body. The body is only extracted when
    ``self.get_body`` is truthy, and is left unset when the delimiters are
    not found in the page.

    The lines below starting with "@" are kept verbatim from the original
    docstring.

    @url http://lkml.iu.edu/hypermail/linux/kernel/0111.3/0036.html
    @returns items 1 1
    @scrapes senderName senderEmail timeSent timeReceived subject body
    @scrapes replyto url
    """
    load = ItemLoader(item=Email(), selector=response)
    load.add_value('url', response.url)

    # First "In reply to:" link in the leading list, or '' when absent.
    pattern_replyto = '//ul[1]/li[contains((b|strong), "In reply to:")]'
    pattern_replyto += '/a/@href'
    links = response.xpath(pattern_replyto).extract()
    load.add_value('replyto', links[0] if links else '')

    # Filled in by whichever archive-format parser applies.
    specific_fields = {
        'senderName': None,
        'senderEmail': None,
        'timeSent': None,
        'timeReceived': None,
        'subject': None,
    }

    new_system = response.xpath('/comment()[1][contains(., "MHonArc")]')
    if new_system:
        specific_fields = self.parse_new_system(response, specific_fields)
        body_before_comment = '<!--X-Body-of-Message-->'
        body_after_comment = '<!--X-Body-of-Message-End-->'
    else:
        specific_fields = self.parse_old_system(response, specific_fields)
        body_before_comment = '<!-- body="start" -->'
        body_after_comment = '<!-- body="end" -->'

    for key, val in specific_fields.items():
        load.add_value(key, val)

    if self.get_body:
        pattern_body = body_before_comment + '\n?(.*)' + body_after_comment
        page_body = response.body.decode('utf-8', 'ignore')
        body_match = re.search(pattern_body, page_body, flags=re.S)
        # re.search returns None when the markers are missing; calling
        # .group(1) unconditionally raised AttributeError in that case.
        if body_match is not None:
            load.add_value('body', body_match.group(1))

    return load.load_item()