Scrapy ItemLoader code examples

Example 1: Scrapy ItemLoader — loading a question item from two page layouts

def parse_question(self, response):
    """Parse a question page into a question item and schedule its answers.

    Two page layouts are supported. The layout is chosen by checking whether
    the marker ``QuestionHeader-title`` occurs in ``response.text``; both
    layouts populate the same item fields and differ only in their selectors.

    Yields:
        A ``scrapy.Request`` for the answer listing (URL built from
        ``self.start_answer_url``, handled by ``self.parse_answer``),
        followed by the loaded ``ZhihuQuestionItem``. Nothing is yielded
        when the response URL does not contain a numeric question id.
    """
    # The id is needed by both layouts and by the follow-up request, so it is
    # resolved once up front. Bailing out here avoids using an unassigned
    # ``question_id`` further down when the URL does not match.
    match_obj = re.match(r"(.*zhihu\.com/question/(\d+))(/|$).*", response.url)
    if match_obj is None:
        # NOTE(review): assumes ``self`` is a scrapy.Spider (provides
        # ``logger``) -- confirm against the enclosing class.
        self.logger.warning("No question id found in URL: %s", response.url)
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    if "QuestionHeader-title" in response.text:
        # Layout that uses the ``QuestionHeader-*`` CSS classes.
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    else:
        # Layout that uses the ``zh-question-*`` ids. The title and the
        # follower count can each live in one of two places, hence the
        # XPath unions.
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()",
        )
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()",
        )
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

    question_item = item_loader.load_item()

    # Presumably 20 and 0 are the page size and starting offset expected by
    # ``start_answer_url`` -- verify against its definition.
    yield scrapy.Request(
        self.start_answer_url.format(question_id, 20, 0),
        headers=self.headers,
        callback=self.parse_answer,
    )
    yield question_item

Example 2: Scrapy ItemLoader — loading a page item and following the next-page link

def parse_page(self, response):
    """Load a ``PageItem`` from the page and follow its next-page link.

    The item always receives ``title`` (taken from the ``title`` cookie of
    the originating request), ``name`` (``self.config["id"]``) and ``url``.
    The ``replace``, ``image_urls`` and ``file_urls`` fields are filled only
    when the corresponding key is present in ``self.config``.

    Yields:
        The loaded item, then -- if ``self.config`` has ``xpathNextImageUrl``
        and that XPath matches -- a ``scrapy.Request`` for the next page,
        handled by this same callback and carrying the same ``title`` cookie.
    """
    title = response.request.cookies['title']

    loader = ItemLoader(item=PageItem(), response=response)
    loader.add_value('title', title)
    loader.add_value('name', self.config["id"])
    loader.add_value('url', response.url)
    # ``in`` replaces ``dict.has_key``, which no longer exists in Python 3.
    if "imageUrlReplacement" in self.config:
        loader.add_value('replace', self.config["imageUrlReplacement"])
    if "xpathImagesPath" in self.config:
        loader.add_xpath('image_urls', self.config["xpathImagesPath"])
    if "xpathFilesPath" in self.config:
        loader.add_xpath('file_urls', self.config["xpathFilesPath"])
    yield loader.load_item()

    # TODO(review): the original TODO here was lost to encoding damage; it
    # referred to ``parse_page``.
    if "xpathNextImageUrl" in self.config:
        next_page = response.xpath(self.config["xpathNextImageUrl"]).extract_first()
        if next_page is not None:
            # ``urljoin`` resolves absolute, root-relative and relative links
            # against the current URL. The previous manual slicing produced
            # a double slash for root-relative links and could raise
            # ValueError on URLs without a path.
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse_page,
                cookies={'title': title},
            )

item loader scrapy code example

Example 1: scrapy itemloader example

Example 2: scrapy itemloader example

Tags:

Python Example

Related

Recent Posts