itemloader list code example
Example 1: scrapy itemloader example
def parse_question(self, response):
if "QuestionHeader-title" in response.text:
match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
if match_obj:
question_id = int(match_obj.group(2))
item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
item_loader.add_css("title", "h1.QuestionHeader-title::text")
item_loader.add_css("content", ".QuestionHeader-detail")
item_loader.add_value("url", response.url)
item_loader.add_value("zhihu_id", question_id)
item_loader.add_css("answer_num", ".List-headerText span::text")
item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
question_item = item_loader.load_item()
else:
match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
if match_obj:
question_id = int(match_obj.group(2))
item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
item_loader.add_xpath("title",
"//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
item_loader.add_css("content", "#zh-question-detail")
item_loader.add_value("url", response.url)
item_loader.add_value("zhihu_id", question_id)
item_loader.add_css("answer_num", "#zh-question-answer-num::text")
item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
item_loader.add_xpath("watch_user_num",
"//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
question_item = item_loader.load_item()
yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
callback=self.parse_answer)
yield question_item
Example 2: scrapy itemloader example
def parse_first_page(self, response):
count = int(response.xpath('//ul[@class="image"]/text()')[0].re(r'.*?(\d+).*?')[0])
title = response.request.cookies['title']
albumURL = response.url.replace(".shtml", '')
for x in xrange(1,count+1):
suffix = ".shtml"
if x > 1:
suffix = "_"+str(x)+".shtml"
request = scrapy.Request(albumURL+suffix, callback=self.parse_item, cookies={'title': title})
yield request
l = ItemLoader(item=PageItem(), response=response)
l.add_value('title', title)
l.add_value('name', self.name)
l.add_value('url', response.url)
l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
yield l.load_item()