scrapy debug itemloader code example

Example 1: scrapy itemloader example

def parse_news(self, response):
        self.logger.info('parse_news: %s' % response)

        # Initialize item loader
        # extract news title, published_at, author, content, url
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        title_selectors = response.css('h1[itemprop="headline"]::text')
        if not title_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        title = title_selectors.extract()[0]
        loader.add_value('title', title)


        author_name_selectors = response.css('a[rel="author"] > span::text')
        if not author_name_selectors:
            loader.add_value('author_name', '')
        else:
            author_name = author_name_selectors.extract()[0]
            loader.add_value('author_name', author_name)

        raw_content_selectors = response.css('.content')
        if not raw_content_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        raw_content = raw_content_selectors.extract()
        raw_content = ' '.join([w.strip() for w in raw_content])
        raw_content = raw_content.strip()
        loader.add_value('raw_content', raw_content)

        date_time_str_selectors = response.css('article > div.time::text')
        if not date_time_str_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()

        # Parse date information
        # Example: Selasa,  6 Oktober 2015 - 05:23 WIB
        date_time_str = date_time_str_selectors.extract()[0]
        date_time_str = date_time_str.split(',')[1].strip()[:-4]
        date_time_str = ' '.join([_(w) for w in date_time_str.split(' ')])
        try:
            published_at_wib = datetime.strptime(date_time_str, '%d %B %Y - %H:%M')
        except ValueError:
            # Will be dropped on the item pipeline
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # Move scraped news to pipeline
        return loader.load_item()

Example 2: scrapy itemloader example

def parse_item(self, response):

        loader = ItemLoader(GaokaopaiZhiyeItem(), response)
        loader.add_value('url', response.url)
        loader.add_value('code', response.url, re=ur'-([^-]+)\.html')
        loader.add_css('name', u'.modTitle>h1::text')

        def parse_category():
            for e in response.css(u'.catType>a'):
                yield {
                    'url': e.css('::attr(href)').extract_first(),
                    'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                    'name': e.css('::text').extract_first(),
                }

        loader.add_value('category', list(parse_category()))
        loader.add_css('detail', u'.zhiyeShow')

        item = loader.load_item()

        return FormRequest(
            url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
            formdata={'code': item['code'][0]},
            meta={'item': item},
            dont_filter=True,
            callback=self.parse_majors
        )

scrapy debug itemloader code example

Example 1: scrapy itemloader example

Example 2: scrapy itemloader example

Tags:

Python Example

Related

Recent Posts