Example 1: Scrapy ItemLoader example (loading a Zhihu question page with CSS and XPath selectors)
def parse_question(self, response):
    """Load a ``ZhihuQuestionItem`` from a question page and request its answers.

    Two page variants are handled: pages whose HTML contains the
    ``QuestionHeader-title`` marker, and all other pages, which are scraped
    with the ``zh-question-*`` selectors.

    Args:
        response: The downloaded page. ``response.url`` is expected to
            contain ``zhihu.com/question/<digits>``.

    Yields:
        A ``scrapy.Request`` built from
        ``self.start_answer_url.format(question_id, 20, 0)`` with
        ``self.parse_answer`` as callback, followed by the loaded item.
        Nothing is yielded when the URL carries no question id.
    """
    # Both page variants and the follow-up request need the id, so resolve
    # it once before doing any other work.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj is None:
        # Without this guard ``question_id`` would be unbound further down
        # and the callback would die with a NameError.
        # NOTE(review): assumes the enclosing class is a scrapy.Spider, which
        # provides ``self.logger`` -- confirm.
        self.logger.warning("No question id found in URL: %s", response.url)
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    # Fields that do not depend on the page variant.
    item_loader.add_value("url", response.url)
    item_loader.add_value("zhihu_id", question_id)

    if "QuestionHeader-title" in response.text:
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    else:
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()",
        )
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()",
        )
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

    question_item = item_loader.load_item()

    yield scrapy.Request(
        self.start_answer_url.format(question_id, 20, 0),
        headers=self.headers,
        callback=self.parse_answer,
    )
    yield question_item
Example 2: Scrapy ItemLoader example (loading a gaokao.chsi.com.cn school page with processors and nested values)
def parse_item(self, response):
    """Load a ``ChsiDaxueItem`` from a school page and follow its nav link.

    Scalar fields are filled through the item loader; ``votes`` and
    ``trending`` are assembled as plain dicts from the response and added
    as values.

    Args:
        response: The downloaded school page.

    Yields:
        One ``Request`` per link extracted from ``#topNav``, carrying the
        loaded item in ``meta['item']`` with ``self.parse_jianjie`` as
        callback.
    """
    # NOTE(review): the runs of "?" inside the string literals below look
    # like non-ASCII labels lost to an encoding error (several different
    # fields share the identical predicate). They are kept as found --
    # restore them from the original source.
    loader = ItemLoader(ChsiDaxueItem(), response)
    # ``ur'...'`` is a syntax error on Python 3; this spelling yields the
    # same pattern on both Python 2 and Python 3.
    loader.add_value('id', response.url, re=u'schId-(\\w+)\\.dhtml')
    loader.add_value('url', response.url)
    loader.add_css(
        'logo',
        u'.r_c_sch_logo>img::attr(src)',
        MapCompose(lambda url: urljoin('http://gaokao.chsi.com.cn/', url)),
    )
    loader.add_css('name', u'.topImg::text')
    loader.add_css('badges', u'.r_c_sch_attr .r_c_sch_icon::attr(title)')

    # Collapse whitespace runs to a single space, then trim both ends.
    # ``x.strip()`` replaces ``unicode.strip`` so this also runs on Python 3.
    data_clean = MapCompose(
        lambda x: re.sub(r'\s+', ' ', x),
        lambda x: x.strip(),
    )
    labelled_fields = (
        ('type', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()'),
        ('membership', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()'),
        ('province', u'//span[@class="f_bold" and span]/following-sibling::text()'),
        ('address', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()'),
        ('phone', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()'),
        ('website', u'//span[@class="f_bold" and .="?????"]/following-sibling::a/@href'),
        ('backdoor', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()'),
    )
    for field_name, field_xpath in labelled_fields:
        loader.add_xpath(field_name, field_xpath, data_clean)

    def parse_votes():
        """Return the three vote scores as floats; a missing score becomes 0.0."""
        xpath = u'//td[@class="tdMydT" and .="{}"]/following-sibling::td/div[@class="rank"]/@rank'

        def get_vote(what):
            return float(response.xpath(xpath.format(what)).extract_first() or 0)

        return {
            'overall': get_vote(u'?????'),
            'environment': get_vote(u'???????'),
            'life': get_vote(u'?????'),
        }

    loader.add_value('votes', parse_votes())

    def parse_trending():
        """Return the rows of the three ranking tables, keyed by table."""
        css = u'{}>table tr:not(:first-child)'

        def get_trending(what):
            # NOTE(review): float()/int() will raise if a row lacks the
            # vote or count cell -- confirm every row carries both.
            return [
                {
                    'id': e.css(u'.tdZytjTDiv>a::attr(href)').re_first(r'specId=(\w+)'),
                    'name': e.css(u'.tdZytjTDiv::attr(title)').extract_first(),
                    'vote': float(e.css(u'.avg_rank::text').extract_first()),
                    'count': int(e.css(u'.c_f00::text, .red::text').extract_first()),
                }
                for e in response.css(css.format(what))
            ]

        return {
            'count': get_trending(u'#topNoofPTable'),
            'index': get_trending(u'#topIndexTable'),
            'like': get_trending(u'.r_r_box_zymyd'),
        }

    loader.add_value('trending', parse_trending())
    item = loader.load_item()

    # The item is handed on through ``meta`` rather than yielded here.
    nav_links = LinkExtractor(restrict_xpaths=u'//ul[@id="topNav"]//a[.="????"]')
    for link in nav_links.extract_links(response):
        yield Request(link.url, meta={'item': item}, callback=self.parse_jianjie)