scrapy debug itemloader code example
Example 1: scrapy itemloader example
def parse_news(self, response):
    """Populate a ``News`` item from an article response.

    Fields are added in this order: ``url``, ``title``, ``author_name``,
    ``raw_content``, ``published_at``.  When the title, the content or the
    publication date cannot be found or parsed, the item is returned with
    only the fields collected up to that point.  A missing author is not
    fatal: ``author_name`` is given an empty string instead.

    :param response: the response for the article page.
    :return: the item produced by ``loader.load_item()``.
    """
    self.logger.info('parse_news: %s', response)

    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)

    # Title: without a headline there is nothing more worth extracting.
    title_selectors = response.css('h1[itemprop="headline"]::text')
    if not title_selectors:
        return loader.load_item()
    loader.add_value('title', title_selectors.extract()[0])

    # Author: optional, falls back to an empty string.
    author_name_selectors = response.css('a[rel="author"] > span::text')
    if author_name_selectors:
        author_name = author_name_selectors.extract()[0]
    else:
        author_name = ''
    loader.add_value('author_name', author_name)

    # Content: every ``.content`` node, each stripped, joined by one space.
    raw_content_selectors = response.css('.content')
    if not raw_content_selectors:
        return loader.load_item()
    raw_content = ' '.join(
        chunk.strip() for chunk in raw_content_selectors.extract()
    ).strip()
    loader.add_value('raw_content', raw_content)

    # Publication date.
    date_time_str_selectors = response.css('article > div.time::text')
    if not date_time_str_selectors:
        return loader.load_item()

    # The text after the first comma carries the date; a string without a
    # comma used to raise IndexError here, so treat it like any other
    # unparsable date and return what has been loaded so far.
    date_parts = date_time_str_selectors.extract()[0].split(',')
    if len(date_parts) < 2:
        return loader.load_item()
    # Drops the last four characters -- presumably a trailing timezone
    # suffix such as " WIB"; TODO confirm against the site markup.
    date_time_str = date_parts[1].strip()[:-4]
    # ``_`` is defined outside this block -- presumably a translation helper
    # that maps localized month names to the English names ``%B`` expects;
    # verify against its definition.
    date_time_str = ' '.join(_(word) for word in date_time_str.split(' '))

    try:
        published_at_wib = datetime.strptime(date_time_str, '%d %B %Y - %H:%M')
    except ValueError:
        return loader.load_item()

    # ``wib_to_utc`` is defined elsewhere; its result is stored as-is.
    loader.add_value('published_at', wib_to_utc(published_at_wib))
    return loader.load_item()
Example 2: scrapy itemloader example
def parse_item(self, response):
    """Load a ``GaokaopaiZhiyeItem`` and request its related majors.

    The item receives ``url``, ``code`` (captured from the URL), ``name``,
    ``category`` (a list of ``{'url', 'code', 'name'}`` dicts, one per
    ``.catType>a`` link) and ``detail``.  The loaded item is then attached
    to a ``FormRequest`` whose callback is ``self.parse_majors``.

    :param response: the response for the page being parsed.
    :return: a ``FormRequest`` carrying the item in ``meta['item']``.
    """
    # Plain raw string: the ``ur''`` prefix is a syntax error on Python 3,
    # while ``r''`` is accepted by both Python 2 and Python 3.
    code_pattern = r'-([^-]+)\.html'

    # Keyword arguments: the second positional parameter of ItemLoader is
    # ``selector``, not ``response``.
    loader = ItemLoader(item=GaokaopaiZhiyeItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('code', response.url, re=code_pattern)
    loader.add_css('name', u'.modTitle>h1::text')

    def parse_category():
        """Yield one dict per category link found on the page."""
        for link in response.css(u'.catType>a'):
            href = link.css('::attr(href)')
            yield {
                'url': href.extract_first(),
                'code': href.re_first(code_pattern),
                'name': link.css('::text').extract_first(),
            }

    loader.add_value('category', list(parse_category()))
    loader.add_css('detail', u'.zhiyeShow')
    item = loader.load_item()

    # NOTE(review): assumes the ``code`` pattern matched the URL; if it did
    # not, ``item['code'][0]`` presumably fails -- confirm how callers want
    # that case handled.
    return FormRequest(
        url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
        formdata={'code': item['code'][0]},
        meta={'item': item},
        dont_filter=True,
        callback=self.parse_majors,
    )