building web scraper use internally on sites manage , having issues outputting xpath lists happen include commas in output string. know need handle commas inside string lists differently commas separate values meant separated columns
# -*- coding: utf-8 -*- import requests lxml import html import urlparse import collections import csv import time # settings statingurl = 'http://www.somdomain.com' domain = 'somedomain' # filename timestr = time.strftime("%m-%d-%y-%h-%m-%s") f = open('scrape-output\\'+domain+'-metadata-'+timestr+'.csv', 'a+') # create url queue, set start, crawl urls_queue = collections.deque() urls_queue.append(statingurl) found_urls = set() found_urls.add(statingurl) # set column headers file colheader = "url crawled, title tag, meta description, h1, h2, h3, h4, h5, h6, image source, image alt" f.write(colheader) f.write("\n") while len(urls_queue): url = urls_queue.popleft() page_url = url print "\n" print "************************************************************" print "\n" # use requests metadata if url.startswith(statingurl): print "connecting %s" % (url,) page = requests.get(url) tree = html.fromstring(page.content) print "\n" # extract metadata elements html tree title = tree.xpath('//title/text()') description = tree.xpath("//head/meta[@name='description']/@content") h1 = tree.xpath('//h1/text()') h2 = tree.xpath('//h2/text()') h3 = tree.xpath('//h3/text()') h4 = tree.xpath('//h4/text()') h5 = tree.xpath('//h5/text()') h6 = tree.xpath('//h6/text()') imgsrc = tree.xpath('//img/@src') imgalt = tree.xpath('//img/@alt') # output metadata print 'found %s title' % len(title) print title,"\n" print 'found %s description' % len(description) print description,"\n" print 'found %s h1' % len(h1) print h1 print 'found %s h2' % len(h2) print h2 print 'found %s h3' % len(h3) print h3 print 'found %s h4' % len(h4) print h4 print 'found %s h5' % len(h5) print h5 print 'found %s h6' % len(h6) print h6 print '\n' print 'found %s image paths' % len(imgsrc) print 'images src:' print imgsrc print "\n" print 'found %s image alt tags' % len(imgsrc) print 'image alt:' print imgalt print "\n" # finds links on page; add url queue print "looking links" links = {urlparse.urljoin(page.url, url) url in tree.xpath('//a/@href') if urlparse.urljoin(page.url, url).startswith('http')} print "set difference find new urls" # set difference find new urls link in (links - found_urls): found_urls.add(link) urls_queue.append(link) print '\n %s urls in queue' % len(urls_queue) # write output file , repeat loop output = "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s" % (page_url.encode('utf-8'), title, description, h1, h2, h3, h4, h5, h6, imgsrc, imgalt) f.write(output) f.write('\n')
if me understand description value how ensure comma in description object parsed string complete comma in it, csv file, appreciate it. there more work on script past that, immediate issue.
thanks.
use in code:
import csv headers = ["url crawled", "title tag", "meta description", "h1", "h2", "h3", "h4", "h5", "h6", "image source", "image alt"] f = open('file.csv', 'ab') writer = csv.writer(f) writer.writerow(headers) writer.writerow(["some", "data, here"])
also, parsing web, better use unicodecsv
module handling unicode content. have install using pip doing pip install unicodecsv
.
unicodecsv
has same functions csv
module.
once you've installed unicodecsv
module have replace
import csv
by
import unicodecsv
and should work better.
Comments
Post a Comment