Scrapy spider ItemLoader code examples
Example 1: Scrapy ItemLoader example
def parse(self, response):
    """Build an ``Area`` item from the level-1 sub-navigation of the page.

    The ``id`` field is the first ``area_id`` value found by running
    ``parse_qs`` over the ``href`` of the second ``<a>`` inside the
    sub-navigation list items; ``name`` is that link's text; ``updated`` is
    the current UTC time as an ISO-8601 string.

    Raises ``IndexError`` when the link is not present and ``KeyError`` when
    the parsed ``href`` carries no ``area_id`` key.
    """
    area_loader = ItemLoader(item=Area(), response=response)

    # NOTE(review): the href is handed to parse_qs unchanged, so it is
    # presumably a bare query string rather than a full URL -- confirm.
    hrefs = response.xpath(
        '//div[@class="clearfix subnav level-1"]//li//a[2]/@href'
    ).extract()
    query_params = parse_qs(hrefs[0])
    area_loader.add_value('id', query_params['area_id'][0])

    area_loader.add_xpath(
        'name',
        '//div[@class="clearfix subnav level-1"]//li//a[2]/text()',
    )

    timestamp = datetime.utcnow().isoformat()
    area_loader.add_value('updated', timestamp)

    return area_loader.load_item()
Example 2: Scrapy ItemLoader example
def parse_news(self, response):
    """Populate a ``News`` item from a JSON response body.

    The body is decoded as JSON and its first element is used as the news
    record. The keys read from it are ``url``, ``title``, ``content``,
    ``published`` and ``author``.

    Fields are loaded in order (``url``, ``title``, ``raw_content``,
    ``published_at``, ``author_name``). The item loaded so far is returned
    early when the title is empty, when the content yields no usable nodes,
    when ``published`` is empty, or when the publication date cannot be
    parsed.
    """
    self.logger.info('parse_news: %s', response)

    # Feed the raw body to json.loads directly: wrapping bytes in str() on
    # Python 3 produces the "b'...'" repr, which is not valid JSON.
    parsed_news = json.loads(response.body)[0]

    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', parsed_news['url'])

    if not parsed_news['title']:
        return loader.load_item()
    loader.add_value('title', parsed_news['title'])

    # Wrap the embedded HTML fragment in its own response so it can be
    # queried with XPath independently of the JSON response.
    html_response = HtmlResponse(
        url=parsed_news['url'],
        body=parsed_news['content'].encode('utf-8', 'ignore'),
    )
    # Keep only the direct children of <body> that neither are nor contain
    # comments, styles, scripts, divs, spans, images or iframes.
    xpath_query = '''
        //body/node()
            [not(descendant-or-self::comment()|
                descendant-or-self::style|
                descendant-or-self::script|
                descendant-or-self::div|
                descendant-or-self::span|
                descendant-or-self::image|
                descendant-or-self::img|
                descendant-or-self::iframe
            )]
    '''
    raw_content_selectors = html_response.xpath(xpath_query)
    if not raw_content_selectors:
        return loader.load_item()
    raw_content = ' '.join(
        fragment.strip() for fragment in raw_content_selectors.extract()
    ).strip()
    loader.add_value('raw_content', raw_content)

    if not parsed_news['published']:
        return loader.load_item()

    # The date portion is expected after the first comma. Without one there
    # is nothing to parse, so return the partial item exactly as is done
    # for an unparseable date below instead of raising IndexError.
    published_parts = parsed_news['published'].split(',')
    if len(published_parts) < 2:
        return loader.load_item()
    # The last four characters are dropped before tokenising
    # (presumably a " WIB" timezone suffix -- TODO confirm); every token is
    # then passed through `_`, which is defined outside this block.
    date_tokens = published_parts[1].strip()[:-4].split(' ')
    date_time_str = ' '.join(_(token) for token in date_tokens)
    try:
        published_at_wib = datetime.strptime(date_time_str,
                                             '%d %b %Y - %H:%M')
    except ValueError:
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    # A falsy author is recorded as an empty string.
    loader.add_value('author_name', parsed_news['author'] or '')
    return loader.load_item()