Example 1: Scrapy ItemLoader example
import re

import scrapy
from scrapy.loader import ItemLoader


def parse_question(self, response):
    # The question id is embedded in the URL; bail out if it cannot be parsed.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if not match_obj:
        return
    question_id = int(match_obj.group(2))
    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    if "QuestionHeader-title" in response.text:
        # New (React-based) question page markup.
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    else:
        # Legacy question page markup.
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()"
            "|//*[@id='zh-question-title']/h2/span/text()")
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()"
            "|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
    question_item = item_loader.load_item()
    # Request the first page of answers, then emit the question item itself.
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0),
                         headers=self.headers, callback=self.parse_answer)
    yield question_item
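For reference, a minimal sketch of the ZhihuQuestionItem the loader above populates. Only the field names are implied by the add_css/add_value calls; the processors shown are assumptions (TakeFirst to collapse single-valued fields, Join for the multi-valued topics):

import scrapy
from scrapy.loader.processors import TakeFirst, Join


class ZhihuQuestionItem(scrapy.Item):
    # Field names come from the loader calls above; processors are assumed.
    zhihu_id = scrapy.Field(output_processor=TakeFirst())
    url = scrapy.Field(output_processor=TakeFirst())
    title = scrapy.Field(output_processor=TakeFirst())
    content = scrapy.Field(output_processor=TakeFirst())
    answer_num = scrapy.Field(output_processor=TakeFirst())
    comments_num = scrapy.Field(output_processor=TakeFirst())
    watch_user_num = scrapy.Field(output_processor=TakeFirst())
    topics = scrapy.Field(output_processor=Join(','))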
Example 2: Scrapy ItemLoader example
import json
from datetime import datetime

from scrapy.http import HtmlResponse
from scrapy.loader import ItemLoader


def parse_news(self, response):
    self.logger.info('parse_news: %s' % response)
    # The endpoint returns a JSON array whose first element is the article.
    # response.text decodes the body (str(response.body) breaks on Python 3).
    parsed_news = json.loads(response.text)[0]
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', parsed_news['url'])
    if not parsed_news['title']:
        # Return the partial item so downstream code can flag missing fields.
        return loader.load_item()
    loader.add_value('title', parsed_news['title'])
    # Wrap the embedded HTML content so it can be queried with XPath.
    html_response = HtmlResponse(url=parsed_news['url'],
                                 body=parsed_news['content'].encode('utf-8', 'ignore'))
    # Keep only top-level body nodes that contain no markup we want to drop.
    xpath_query = '''
        //body/node()
            [not(descendant-or-self::comment()|
                 descendant-or-self::style|
                 descendant-or-self::script|
                 descendant-or-self::div|
                 descendant-or-self::span|
                 descendant-or-self::image|
                 descendant-or-self::img|
                 descendant-or-self::iframe
            )]
    '''
    raw_content_selectors = html_response.xpath(xpath_query)
    if not raw_content_selectors:
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    raw_content = raw_content.strip()
    loader.add_value('raw_content', raw_content)
    if not parsed_news['published']:
        return loader.load_item()
    # The published string looks like 'Kamis, 06 Okt 2016 - 05:00 WIB':
    # drop the day name before the comma and the trailing ' WIB', then
    # translate the Indonesian tokens to English with the project's `_`
    # helper so strptime can parse them.
    date_time_str = ' '.join(
        [_(w) for w in parsed_news['published'].split(',')[1].strip()[:-4].split(' ')])
    try:
        published_at_wib = datetime.strptime(date_time_str, '%d %b %Y - %H:%M')
    except ValueError:
        return loader.load_item()
    # Convert from Western Indonesian Time to UTC before storing.
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    loader.add_value('author_name', parsed_news['author'] or '')
    return loader.load_item()
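Examples 2 and 4 both convert the parsed timestamp with wib_to_utc before storing it. A minimal sketch of that helper, assuming naive datetimes and using the fact that WIB (Waktu Indonesia Barat, Western Indonesian Time) is a fixed UTC+7 offset with no daylight saving:

from datetime import timedelta


def wib_to_utc(dt):
    # WIB is UTC+7 year-round, so subtracting 7 hours yields UTC.
    return dt - timedelta(hours=7)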
Example 3: Scrapy ItemLoader example
from scrapy.loader import ItemLoader


def _extract_item(self, response):
    # Extra keyword arguments such as type='html' are stored in the
    # loader context rather than being a stock ItemLoader parameter.
    l = ItemLoader(response=response, item=MyspiderItem(), type='html')
    l.add_xpath('movie_name', '//h1/span[@property="v:itemreviewed"]/text()')
    l.add_xpath('movie_year', '//span[@property="v:initialReleaseDate"]/text()')
    l.add_xpath('movie_type', '//span[@property="v:genre"]/text()')
    l.add_xpath('movie_rate', '//strong[@class="ll rating_num"]/text()')
    l.add_value('url', response.url)
    return dict(l.load_item())
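Note that load_item() keeps fields as lists unless an output processor says otherwise, so the dict() above would normally hold one-element lists. One common fix is a loader subclass with a TakeFirst default output processor; the class below is a hypothetical illustration, not part of the original example:

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst


class MyspiderItemLoader(ItemLoader):
    # With TakeFirst as the default output processor, each loaded field
    # becomes a single value instead of a one-element list.
    default_output_processor = TakeFirst()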
Example 4: Scrapy ItemLoader example
from datetime import datetime

from scrapy import Request
from scrapy.exceptions import CloseSpider
from scrapy.loader import ItemLoader


def parse_news_metro(self, response):
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', response.url)
    date_selector = response.css('.artikel > div.block-tanggal::text')
    if not date_selector:
        return self.parse_news_pilkada(loader, response)
    try:
        # Drop the day name before the comma and the trailing ' WIB' suffix,
        # then translate the Indonesian date tokens to English via `_`.
        date_time_str = date_selector.extract()[0].split(',')[1].strip()[:-4]
        date_time_str = ' '.join([_(x) for x in date_time_str.split(' ')])
        published_at_wib = datetime.strptime(date_time_str, '%d %B %Y | %H:%M')
    except Exception:
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    if self.media['last_scraped_at'] >= published_at:
        is_no_update = True
        self.logger.info('Media have no update')
        raise CloseSpider('finished')
    loader.add_value('published_at', published_at)
    title_selector = response.css('.artikel > h1::text')
    if not title_selector:
        return loader.load_item()
    loader.add_value('title', title_selector.extract()[0])
    # Skip paragraphs that merely wrap an iframe (embedded media).
    raw_content_selector = response.xpath('//div[@class="artikel"]//p[not(iframe)]')
    if not raw_content_selector:
        return loader.load_item()
    raw_content = ''
    for rsl in raw_content_selector:
        raw_content = raw_content + rsl.extract().strip()
    # Follow pagination if present, carrying the loader and the content
    # collected so far into the next callback. Note: '//a' here searches
    # the whole page; './/a' would limit it to the pagination block.
    next_page_selector = response.css('.pagination-nb').xpath('//a[text()="next"]/@href')
    if next_page_selector:
        return Request(next_page_selector.extract()[0],
                       callback=lambda x, loader=loader, raw_content=raw_content:
                           self.parse_next_page_metro(x, loader, raw_content))
    loader.add_value('raw_content', raw_content)
    # The author is usually the last all-caps <strong> text in the body
    # (e.g. 'FOO | BAR'), so scan the paragraphs in reverse.
    author_name = ''
    for author_name_selector in reversed(raw_content_selector):
        author_name_selector = author_name_selector.css('strong::text')
        for tmp in reversed(author_name_selector.extract()):
            tmp = tmp.strip()
            if tmp and all((x.isalpha() and x.isupper()) or x.isspace()
                           or x == '.' or x == '|' for x in tmp):
                author_name = tmp
                break
        if author_name:
            break
    author_name = ','.join(author_name.split(' | '))
    loader.add_value('author_name', author_name)
    return loader.load_item()
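The pagination Request above delegates to parse_next_page_metro, carrying the half-filled loader and the content gathered so far. Only the (response, loader, raw_content) signature is implied by the lambda; the body below is a plausible sketch of that continuation, not the original code:

def parse_next_page_metro(self, response, loader, raw_content):
    # Hypothetical continuation: append this page's paragraphs, keep
    # following 'next' links, and load the item when pagination ends.
    for rsl in response.xpath('//div[@class="artikel"]//p[not(iframe)]'):
        raw_content = raw_content + rsl.extract().strip()
    next_page = response.css('.pagination-nb').xpath('//a[text()="next"]/@href')
    if next_page:
        return Request(next_page.extract()[0],
                       callback=lambda x, loader=loader, raw_content=raw_content:
                           self.parse_next_page_metro(x, loader, raw_content))
    loader.add_value('raw_content', raw_content)
    return loader.load_item()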
Example 5: Scrapy ItemLoader example
from urllib.parse import urljoin

from scrapy import Request
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose


def parse(self, response):
    # Rows with a centered cell start a new college; the rows that follow
    # inherit the current college code/name until the next centered cell.
    # ('#comapreTable' matches the id as it appears on the page.)
    for outer in response.css('#comapreTable tr:not(:first-child)'):
        if outer.css('td[align="center"]'):
            ccode = outer.css('td[align="center"]>a::attr(id)').extract_first()
            cname = outer.css('td[align="center"]>a::text').extract_first()
        for inner in outer.xpath('td[div[@align="left"]/a]'):
            loader = ItemLoader(item=EolZhuanyeItem(), selector=inner)
            loader.add_value('ccode', ccode)
            loader.add_value('cname', cname)
            # Resolve the relative link against the site root.
            loader.add_css('url', 'a::attr(href)',
                           lambda urls: urljoin(self.start_urls[0], urls[0]))
            loader.add_xpath('code', 'following-sibling::td[1]/text()',
                             MapCompose(str.strip))  # unicode.strip in Python 2
            loader.add_css('name', 'a::text', MapCompose(str.strip))
            item = loader.load_item()
            yield Request(url=item['url'][0], meta={'item': item},
                          callback=self.parse_item)
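Passing the half-built item through meta is what lets the detail page finish it. A minimal sketch of the assumed parse_item callback; the 'intro' field and the selector are illustrative, not from the original:

def parse_item(self, response):
    # Hypothetical detail-page callback: recover the partially built item
    # from meta, add detail-page data, and emit it.
    item = response.meta['item']
    item['intro'] = response.css('#content::text').extract_first()  # assumed field
    yield item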