itemprocessor scrapy code example
Example 1: scrapy itemloader example
def _extract_item(self, response):
    """Extract the movie fields from ``response`` and return them as a dict.

    Each XPath below is registered on an ``ItemLoader`` bound to a fresh
    ``MyspiderItem``; the response URL is added as the ``url`` field.

    :param response: the page response to extract from.
    :returns: ``dict`` built from the loaded item.
    """
    # Field name -> XPath expression, registered in this order.
    xpath_by_field = (
        ('movie_name', '//h1/span[@property="v:itemreviewed"]/text()'),
        ('movie_year', '//span[@property="v:initialReleaseDate"]/text()'),
        ('movie_type', '//span[@property="v:genre"]/text()'),
        ('movie_rate', '//strong[@class="ll rating_num"]/text()'),
    )

    loader = ItemLoader(response=response, item=MyspiderItem(), type='html')
    for field_name, xpath in xpath_by_field:
        loader.add_xpath(field_name, xpath)
    loader.add_value('url', response.url)

    loaded_item = loader.load_item()
    # The loaded item is converted to a plain dict before being returned.
    # NOTE(review): the original (mis-encoded) comment mentions scrapy-redis
    # and JSON, so the conversion is presumably for serialisation -- confirm.
    return dict(loaded_item)
Example 2: scrapy itemloader example
def parse_news(self, response):
    """Build a ``News`` item from a JSON response keyed under ``NewsML``.

    The response body is decoded as JSON and the fields ``url``, ``title``,
    ``raw_content``, ``author_name`` and ``published_at`` are added to the
    loader in that order.  Extraction stops at the first field whose path is
    missing from the document (or, for ``title``, ``raw_content`` and the
    creation timestamp, whose value is empty or malformed) and whatever has
    been collected so far is returned.

    :param response: response whose ``body`` holds the JSON document.
    :returns: the result of ``loader.load_item()``, possibly only partly
        populated.
    :raises ValueError: propagated from ``json.loads`` when the body is not
        valid JSON.
    """
    self.logger.info('parse_news: %s', response)
    loader = ItemLoader(item=News(), response=response)
    json_response = json.loads(response.body)

    # Sentinel so that "path absent" can be told apart from a stored
    # None / '' value (``url`` and ``author_name`` accept empty values).
    missing = object()

    def lookup(path):
        """Follow ``path`` (a tuple of keys) down ``json_response``."""
        node = json_response
        for key in path:
            try:
                node = node[key]
            except (KeyError, TypeError):
                # TypeError: an intermediate level is a list, string or
                # None rather than a dict -- treat it like a missing key.
                return missing
        return node

    # Shared prefixes of the nested document paths.
    item_path = ('NewsML', 'NewsItem')
    component_path = item_path + ('NewsComponent', 'NewsComponent')
    story_path = component_path + ('NewsComponent',)

    url = lookup(story_path + ('NewsLines', 'MoreLink'))
    if url is missing:
        return loader.load_item()
    loader.add_value('url', url)

    title = lookup(story_path + ('NewsLines', 'HeadLine'))
    if title is missing or not title:
        return loader.load_item()
    loader.add_value('title', title)

    raw_content = lookup(
        story_path
        + ('ContentItem', 'DataContent', 'nitf', 'body', 'body.content', 'p')
    )
    if raw_content is missing or not raw_content:
        return loader.load_item()
    loader.add_value('raw_content', raw_content)

    author_name = lookup(component_path + ('Author',))
    if author_name is missing:
        return loader.load_item()
    # An empty author is stored as '' instead of aborting the item.
    loader.add_value('author_name', author_name or '')

    date_time_str = lookup(item_path + ('NewsManagement', 'FirstCreated'))
    if date_time_str is missing or not date_time_str:
        return loader.load_item()

    # Expected shape is "<YYYYMMDD>T<HHMMSS>".  Without exactly one 'T'
    # there is no usable time part, so bail out like any other malformed
    # date instead of raising IndexError.
    parts = date_time_str.split('T')
    if len(parts) != 2:
        return loader.load_item()
    date_part, time_part = parts

    # Left-pad the time with zeros to six digits before parsing.
    normalized = date_part + ' ' + time_part.rjust(6, '0')
    try:
        published_at_wib = datetime.strptime(normalized, '%Y%m%d %H%M%S')
    except ValueError:
        return loader.load_item()

    loader.add_value('published_at', wib_to_utc(published_at_wib))
    return loader.load_item()