How to write Python Scrapy code for extracting the URLs present in the sitemap of a site


I'm trying to use the code below to get a list of the URLs in a sitemap. When I run it, I see no results on the screen. Can you tell me what the problem is, or suggest a better example? Thanks in advance.

class MySpider(SitemapSpider):
    """Crawl every URL listed in the site's sitemap.xml.

    SitemapSpider downloads each entry of ``sitemap_urls``, expands any
    sitemap-index files it finds, and calls ``parse`` once per page URL
    discovered in the sitemap.
    """
    name = "xyz"
    allowed_domains = ["xyz.nl"]
    sitemap_urls = ["http://www.xyz.nl/sitemap.xml"]

    def parse(self, response):
        # Each response here is a page that was discovered via the sitemap.
        print(response.url)
        # BUG in the original: re-requesting the URL we just fetched is
        # silently dropped by Scrapy's duplicate filter, so the callback
        # never fired and nothing appeared on screen.  dont_filter=True
        # bypasses the dupefilter so parse_sitemap_url actually runs.
        return Request(response.url, callback=self.parse_sitemap_url,
                       dont_filter=True)

    def parse_sitemap_url(self, response):
        # Do your per-page processing of the sitemap links here.
        pass

This spider gathers the URLs from the sitemap and saves them to a list. You can change the output to a file or leave it on the console.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
import re
import requests


class GetPagesFromSitemapSpider(SitemapSpider):
    """Collect every page URL advertised by a site's sitemap.

    The constructor probes ``/sitemap.xml`` first and falls back to
    ``/robots.txt``; ``_parse_sitemap`` then walks sitemap-index files
    recursively and prints the page URLs found in each urlset.
    """

    name = "test"
    # Let 404 responses through so the robots.txt fallback logic can run
    # instead of Scrapy discarding them before our callbacks see them.
    handle_httpstatus_list = [404]

    def __init__(self, spider=None, *a, **kw):
        super(GetPagesFromSitemapSpider, self).__init__(*a, **kw)
        self.spider = spider
        # Prefer /sitemap.xml; if it 404s, fall back to /robots.txt
        # (SitemapSpider knows how to extract sitemap URLs from robots).
        base_url = "https://channelstore.roku.com"
        found = []
        resp = requests.head(base_url + "/sitemap.xml")
        if resp.status_code != 404:
            found.append(resp.url)
        else:
            resp = requests.head(base_url + "/robots.txt")
            if resp.status_code == 200:
                found.append(resp.url)
        self.sitemap_urls = found
        print(self.sitemap_urls)

    def parse(self, response):
        # Default page callback: just show which URL was crawled.
        print(response.url)

    def _parse_sitemap(self, response):
        """Walk a sitemap (or robots.txt) response and collect page URLs.

        Overrides SitemapSpider._parse_sitemap so urlset entries are
        accumulated into a list and printed rather than yielded as
        Requests.
        """
        if response.url.endswith('/robots.txt'):
            # robots.txt may declare one or more "Sitemap:" lines.
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                self.logger.info('Ignoring invalid sitemap: %s', response.url)
                return

            s = Sitemap(body)
            sites = []
            if s.type == 'sitemapindex':
                # A sitemap index points at further sitemap files:
                # recurse into each one that matches the follow patterns.
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                # A urlset holds the actual page URLs; keep each one that
                # matches any of the configured callback rules.
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            sites.append(loc)
                            break
            print(sites)


def iterloc(it, alt=False):
    """Yield the <loc> URL of every sitemap entry in *it*.

    When *alt* is true, also yield alternate-language URLs declared via
    ``<xhtml:link rel="alternate">`` entries.
    """
    for d in it:
        yield d['loc']

        # Consider alternate URLs (xhtml:link rel="alternate")
        if alt and 'alternate' in d:
            for link in d['alternate']:
                yield link

Comments