scrapy spider loader code example

Example 1: scrapy itemloader example

def parse(self, response):
        l = ItemLoader(item=Area(), response=response)
        l.add_value('id', parse_qs(response.xpath('//div[@class="clearfix subnav level-1"]//li//a[2]/@href').extract()[0])['area_id'][0])
        l.add_xpath('name', '//div[@class="clearfix subnav level-1"]//li//a[2]/text()')
        l.add_value('updated', datetime.utcnow().isoformat()) # you can also use literal values
        return l.load_item()
        #self.log('URL: {}'.format(response.url))

Example 2: scrapy itemloader example

def parse_news(self, response):
        self.logger.info('parse_news: %s' % response)
        parsed_news = json.loads(str(response.body))[0]

        # Initialize item loader
        # extract news title, published_at, author, content, url
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', parsed_news['url'])

        if not parsed_news['title']:
            # Will be dropped on the item pipeline
            return loader.load_item()
        loader.add_value('title', parsed_news['title'])

        # Convert HTML text to a scrapy response
        html_response = HtmlResponse(url=parsed_news['url'],
                body=parsed_news['content'].encode('utf-8', 'ignore'))
        xpath_query = '''
            //body/node()
                [not(descendant-or-self::comment()|
                    descendant-or-self::style|
                    descendant-or-self::script|
                    descendant-or-self::div|
                    descendant-or-self::span|
                    descendant-or-self::image|
                    descendant-or-self::img|
                    descendant-or-self::iframe
                )]
        '''
        raw_content_selectors = html_response.xpath(xpath_query)
        if not raw_content_selectors:
            # Will be dropped on the item pipeline
            return loader.load_item()
        raw_content = raw_content_selectors.extract()
        raw_content = ' '.join([w.strip() for w in raw_content])
        raw_content = raw_content.strip()
        loader.add_value('raw_content', raw_content)

        if not parsed_news['published']:
            # Will be dropped on the item pipeline
            return loader.load_item()

        # Parse date information
        # Example: 12 Oct 2016 - 05:25
        date_time_str = ' '.join([_(w) for w in parsed_news['published'].split(',')[1].strip()[:-4].split(' ')])
        try:
            published_at_wib = datetime.strptime(date_time_str,
                    '%d %b %Y - %H:%M')
        except ValueError:
            # Will be dropped on the item pipeline
            return loader.load_item()
        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        if not parsed_news['author']:
            loader.add_value('author_name', '')
        else:
            loader.add_value('author_name', parsed_news['author'])

        # Move scraped news to pipeline
        return loader.load_item()

scrapy spider loader code example

Example 1: scrapy itemloader example

Example 2: scrapy itemloader example

Tags:

Python Example

Related

Recent Posts