itemloader.processors.TakeFirst code example

Example 1: scrapy itemloader example

def parse(self, response):
        """Build a ``MovieItem`` from a movie detail page.

        Fields are read from the page header (``div#content h1``), the
        ``div#info`` block and the rating element, and fed into an
        ``ItemLoader``.

        The callback is best-effort: it never raises. A page without a title
        is skipped with a warning, and any unexpected error is logged with
        its traceback instead of being silently discarded.

        :param response: the downloaded detail page.
        :returns: the loaded item, or ``None`` when the page has no title or
            extraction fails.
        """
        import logging

        logger = logging.getLogger(__name__)
        try:
            loader = ItemLoader(item=MovieItem(), response=response)

            # extract_first() returns None when nothing matches, so check
            # before calling str methods on the result.
            name = response.css(
                'div#content h1 [property="v:itemreviewed"]::text').extract_first()
            if name is None:
                logger.warning('No movie title found on %s; skipping page',
                               response.url)
                return None
            loader.add_value('name', name.strip())

            # The year is rendered as "(YYYY)"; drop the surrounding
            # parentheses. A missing year is treated as optional, like the
            # other fields below, rather than discarding the whole item.
            year = response.css('div#content h1 span.year::text').extract_first()
            if year and year.startswith('('):
                year = year[1:-1]
            loader.add_value('year', year)

            # The first two non-empty bare text nodes of div#info are used as
            # region and language; each may hold several '/'-separated values.
            cleaned = (raw.strip().strip('/')
                       for raw in response.css('div#info::text').extract())
            info_texts = [text for text in cleaned if text][:2]
            if len(info_texts) == 2:
                loader.add_value('region', info_texts[0].split('/'))
                loader.add_value('language', info_texts[1].split('/'))

            loader.add_value('duration', response.css('div#info [property="v:runtime"]::attr(content)').extract_first())
            loader.add_value('types', response.css('div#info [property="v:genre"]::text').extract())
            loader.add_value('directors', response.css('div#info [rel="v:directedBy"]::text').extract())
            loader.add_value('actors', response.css('div#info [rel="v:starring"]::text').extract())
            # NOTE(review): 'runtime' receives the initial release date(s)
            # while 'duration' above receives the running time. The field
            # names are kept because they belong to MovieItem, which is not
            # visible here -- confirm the naming is intended.
            loader.add_value('runtime', response.css('div#info [property="v:initialReleaseDate"]::text').extract())
            loader.add_value('detailurl', response.url)
            loader.add_value('IMDburl', response.css('div#info [rel="nofollow"]::attr(href)').extract())
            loader.add_value('stars', response.css('strong[property="v:average"]::text').extract_first())
            return loader.load_item()
        except Exception:
            # Callback boundary: keep the "never raise" behaviour, but leave
            # a trace so broken selectors or pages can be diagnosed.
            logger.exception('Failed to parse movie page %s',
                             getattr(response, 'url', None))
            return None

Example 2: scrapy itemloader example

def parse_item(self, response):

        loader = ItemLoader(ChsiDaxueItem(), response)
        loader.add_value('id', response.url, re=ur'schId-(\w+)\.dhtml')
        loader.add_value('url', response.url)
        loader.add_css('logo', u'.r_c_sch_logo>img::attr(src)', MapCompose(lambda url: urljoin('http://gaokao.chsi.com.cn/', url)))
        loader.add_css('name', u'.topImg::text')
        loader.add_css('badges', u'.r_c_sch_attr .r_c_sch_icon::attr(title)')

        data_clean = MapCompose(lambda x: re.sub(r'\s+', ' ', x), unicode.strip)
        loader.add_xpath('type', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
        loader.add_xpath('membership', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
        loader.add_xpath('province', u'//span[@class="f_bold" and span]/following-sibling::text()', data_clean)
        loader.add_xpath('address', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
        loader.add_xpath('phone', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)
        loader.add_xpath('website', u'//span[@class="f_bold" and .="?????"]/following-sibling::a/@href', data_clean)
        loader.add_xpath('backdoor', u'//span[@class="f_bold" and .="?????"]/following-sibling::text()', data_clean)

        def parse_votes():
            xpath = u'//td[@class="tdMydT" and .="{}"]/following-sibling::td/div[@class="rank"]/@rank'
            get_vote = lambda what: float(response.xpath(xpath.format(what)).extract_first() or 0)
            return {
                'overall': get_vote(u'?????'),
                'environment': get_vote(u'???????'),
                'life': get_vote(u'?????'),
            }

        loader.add_value('votes', parse_votes())

        def parse_trending():
            css = u'{}>table tr:not(:first-child)'
            def get_trending(what):
                majors = []
                for e in response.css(css.format(what)):
                    majors.append({
                        'id': e.css(u'.tdZytjTDiv>a::attr(href)').re_first(r'specId=(\w+)'),
                        'name': e.css(u'.tdZytjTDiv::attr(title)').extract_first(),
                        'vote': float(e.css(u'.avg_rank::text').extract_first()),
                        'count': int(e.css(u'.c_f00::text, .red::text').extract_first()),
                    })
                return majors
            return {
                'count': get_trending(u'#topNoofPTable'),
                'index': get_trending(u'#topIndexTable'),
                'like': get_trending(u'.r_r_box_zymyd'),
            }

        loader.add_value('trending', parse_trending())

        item = loader.load_item()

        for link in LinkExtractor(restrict_xpaths=u'//ul[@id="topNav"]//a[.="????"]').extract_links(response):
            yield Request(link.url, meta={'item': item}, callback=self.parse_jianjie)