Issues Writing xpath objects with commas out to CSV in Python -

I am building a web scraper to use internally on sites that we manage, and I am having issues outputting xpath lists that happen to include commas in the output string. I know I need to handle the commas inside the string lists differently from the commas that separate values meant to be split into columns.

# -*- coding: utf-8 -*- import requests lxml import html import urlparse import collections import csv import time  # settings statingurl = 'http://www.somdomain.com' domain = 'somedomain'  # filename timestr = time.strftime("%m-%d-%y-%h-%m-%s") f = open('scrape-output\\'+domain+'-metadata-'+timestr+'.csv', 'a+')  # create url queue, set start, crawl urls_queue = collections.deque() urls_queue.append(statingurl) found_urls = set() found_urls.add(statingurl)  # set column headers file colheader = "url crawled, title tag, meta description, h1, h2, h3, h4, h5, h6, image source, image alt" f.write(colheader) f.write("\n")  while len(urls_queue):     url = urls_queue.popleft()     page_url = url     print "\n"     print "************************************************************"     print "\n"      # use requests metadata     if url.startswith(statingurl):         print "connecting %s" % (url,)         page = requests.get(url)         tree = html.fromstring(page.content)         print "\n"      # extract metadata elements html tree     title = tree.xpath('//title/text()')     description = tree.xpath("//head/meta[@name='description']/@content")     h1 = tree.xpath('//h1/text()')     h2 = tree.xpath('//h2/text()')     h3 = tree.xpath('//h3/text()')     h4 = tree.xpath('//h4/text()')     h5 = tree.xpath('//h5/text()')     h6 = tree.xpath('//h6/text()')     imgsrc = tree.xpath('//img/@src')     imgalt = tree.xpath('//img/@alt')      # output metadata     print 'found %s title' % len(title)      print title,"\n"     print 'found %s description' % len(description)     print description,"\n"       print 'found %s h1' % len(h1)        print h1     print 'found %s h2' % len(h2)        print h2     print 'found %s h3' % len(h3)        print h3     print 'found %s h4' % len(h4)        print h4     print 'found %s h5' % len(h5)        print h5     print 'found %s h6' % len(h6)        print h6         print '\n'     print 'found %s image paths' % len(imgsrc)     print 'images 
src:'     print imgsrc      print "\n"     print 'found %s image alt tags' % len(imgsrc)        print 'image alt:'     print imgalt     print "\n"      # finds links on page; add url queue     print "looking links"     links = {urlparse.urljoin(page.url, url) url in tree.xpath('//a/@href') if urlparse.urljoin(page.url, url).startswith('http')}      print "set difference find new urls"     # set difference find new urls     link in (links - found_urls):         found_urls.add(link)         urls_queue.append(link)      print '\n %s urls in queue' % len(urls_queue)      # write output file , repeat loop     output = "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s" % (page_url.encode('utf-8'), title, description, h1, h2, h3, h4, h5, h6, imgsrc, imgalt)     f.write(output)     f.write('\n')

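If anyone can help me understand how to ensure that a comma in the description value is parsed as part of the string, so that the description stays complete, with the comma in it, in the CSV file, I would appreciate it. There is more work to do on the script past that, but this is the immediate issue.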

thanks.

Use the csv module in your code:

import csv

# Column titles, in the order the row values are written.
headers = ["url crawled", "title tag", "meta description", "h1",
           "h2", "h3", "h4", "h5", "h6", "image source", "image alt"]

# Binary append mode, as the Python 2 csv module expects; the with-block
# closes the file once the rows are written.
with open('file.csv', 'ab') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    # The second value contains a comma; csv.writer quotes it so it stays
    # in a single column.
    writer.writerow(["some", "data, here"])

Also, since you are parsing the web, it is better to use the unicodecsv module for handling Unicode content. You have to install it using pip by running pip install unicodecsv.

unicodecsv has the same functions as the csv module.

Once you've installed the unicodecsv module, you just have to replace

import csv

with

import unicodecsv

(and call unicodecsv.writer wherever the code called csv.writer), and it should work better.

Shah

Search This Blog

Issues Writing xpath objects with commas out to CSV in Python -

Comments

Post a Comment