I'm trying to use this code to get a list of the URLs in a sitemap. When I run it, I see no results on the screen. Can you tell me what the problem is, or suggest a better example? Thanks in advance.
from scrapy.http import Request
from scrapy.spiders import SitemapSpider


class myspider(SitemapSpider):
    """Spider from the question: visits every URL listed in the site's sitemap."""

    name = "xyz"
    allowed_domains = ["xyz.nl"]
    sitemap_urls = ["http://www.xyz.nl/sitemap.xml"]

    def parse(self, response):
        """Print the URL of a downloaded page and request that same URL again.

        NOTE(review): the follow-up request targets the URL that was just
        downloaded. Scrapy's scheduler drops duplicate requests unless
        ``dont_filter=True`` is passed, so ``parse_sitemap_url`` is
        presumably never reached -- confirm against the crawl log.
        """
        print(response.url)
        return Request(response.url, callback=self.parse_sitemap_url)

    def parse_sitemap_url(self, response):
        """Handle a page reached through the sitemap (left as a stub)."""
        # Process the links found in the sitemap here.
        pass
This spider gets the URLs from a sitemap and saves them to a list. You can change the output to a file or to the console.
# -*- coding: utf-8 -*-
import re

import requests
import scrapy
from scrapy.http import Request, XmlResponse
from scrapy.spiders import SitemapSpider
from scrapy.spiders import Spider
from scrapy.utils.gz import gunzip, is_gzipped
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots


class getpagesfromsitemapspider(SitemapSpider):
    """Collect the page URLs listed in a site's sitemap and print them.

    The sitemap location is discovered in ``__init__``; ``_parse_sitemap``
    gathers the matching ``<loc>`` entries into a list instead of
    scheduling a request for each page.
    """

    name = "test"
    # Let 404 responses reach the callbacks instead of being filtered out.
    handle_httpstatus_list = [404]

    def __init__(self, spider=None, *a, **kw):
        """Work out which sitemap (or robots.txt) URL to start from.

        Args:
            spider: Stored unchanged on ``self.spider``.
            *a: Positional arguments forwarded to ``SitemapSpider``.
            **kw: Keyword arguments forwarded to ``SitemapSpider``.
        """
        super(getpagesfromsitemapspider, self).__init__(*a, **kw)
        self.spider = spider
        start_urls = []
        url = "https://channelstore.roku.com"
        # Probe with HEAD requests so no response bodies are downloaded.
        resp = requests.head(url + "/sitemap.xml")
        if resp.status_code != 404:
            start_urls.append(resp.url)
        else:
            # No sitemap.xml: fall back to robots.txt, which may list sitemaps.
            resp = requests.head(url + "/robots.txt")
            if resp.status_code == 200:
                start_urls.append(resp.url)
        self.sitemap_urls = start_urls
        print(self.sitemap_urls)

    def parse(self, response):
        """Print the URL of a downloaded page."""
        print(response.url)

    def _parse_sitemap(self, response):
        """Walk a robots.txt or sitemap response and print the page URLs.

        For robots.txt and sitemap-index responses this yields follow-up
        requests handled by this same method. For a ``urlset`` sitemap the
        locations matching a pattern in ``self._cbs`` are collected and
        printed; no request is issued for them.

        Yields:
            Request: One per nested sitemap that should be fetched.
        """
        if response.url.endswith('/robots.txt'):
            # The robots.txt parser works on text, so pass the decoded body.
            for url in sitemap_urls_from_robots(response.text):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                self.logger.info('ignoring invalid sitemap: %s', response.url)
                return

            s = Sitemap(body)
            sites = []
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            sites.append(loc)
                            # First matching pattern wins; avoid duplicates.
                            break
            print(sites)


def iterloc(it, alt=False):
    """Yield the ``loc`` value of every entry in ``it``.

    Args:
        it: Iterable of mappings that each have a ``'loc'`` key.
        alt: When true, also yield each URL under an entry's
            ``'alternate'`` key, if that key is present.

    Yields:
        The location of each entry, followed by its alternates when
        ``alt`` is true.
    """
    for d in it:
        yield d['loc']

        # consider alternate urls (xhtml:link rel="alternate")
        if alt and 'alternate' in d:
            for alternate in d['alternate']:
                yield alternate
Comments
Post a Comment