Python - Scrapy spider cannot extract contents of web page using XPath
I have a Scrapy spider and I am using XPath selectors to extract the contents of the page. Kindly check where I am going wrong.
# Spider as posted in the question. Only the syntax and layout lost in
# extraction were restored (``from``/``for`` keywords, line breaks, and
# identifier capitalisation); the logic is intentionally left as the asker
# wrote it, and its visible problems are flagged with NOTE(review) comments.
# NOTE(review): the capitalisation of project names such as
# ``MedicalprojectItem`` is reconstructed -- confirm against the project.
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from medicalproject.items import MedicalprojectItem
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector  # duplicate import, present in the original post
from scrapy import Request


class MySpider(CrawlSpider):
    """Crawl yananow.org story links and try to extract name and story text."""

    name = "medical"
    allowed_domains = ["yananow.org"]
    start_urls = ["http://yananow.org/query_stories.php"]

    rules = (
        # NOTE(review): the callback is named 'parse_page', but the only
        # method defined on this class is ``parse_items``, so the rule points
        # at a method that does not exist here.
        # NOTE(review): in the pattern, ``\i`` is not a recognised regex
        # escape (an error on Python 3.6+), and the ``.`` in
        # ``display_story.php`` is unescaped, so it matches any character.
        Rule(
            SgmlLinkExtractor(allow=[r'display_story.php\?\id\=\d+']),
            callback='parse_page',
            follow=True,
        ),
    )

    def parse_items(self, response):
        """Build one item per matched table cell and return them as a list.

        :param response: the downloaded page handed over by Scrapy.
        :returns: list of ``MedicalprojectItem`` objects.
        """
        hxs = HtmlXPathSelector(response)
        # NOTE(review): presumably this path was copied from a browser
        # inspector; ``tbody`` elements are often absent from the raw HTML
        # that Scrapy downloads -- verify against the page source.
        titles = hxs.xpath('/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td')
        items = []
        for title in titles:
            item = MedicalprojectItem()
            # NOTE(review): both expressions below start with ``/`` and are
            # therefore absolute: they are evaluated from the document root,
            # not relative to ``title``, so every iteration yields the same
            # result. Also, ``img`` elements carry no text content, so
            # ``img[1]/text()`` cannot return a name.
            item["patient_name"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/img[1]/text()").extract()
            item["stories"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/div/font/p/text()").extract()
            items.append(item)
        return(items)
There are a lot of issues with your code, so here is a different approach.
I opted against a CrawlSpider
to have more control over the scraping process, especially grabbing the name
from the query page and the story from a detail page.
I tried to simplify the XPath
statements by not diving into the (nested) table structures but looking for patterns of content. So if you want to extract a story ... there must be a link to a story.
Here comes the tested code (with comments):
# -*- coding: utf-8 -*-
"""Spider that pairs each story link on the query page with its story text.

The name is taken from the link text on the query page; the story text is
taken from the linked detail page. The half-filled item travels between the
two callbacks in ``request.meta``.
"""
import scrapy


class MyItem(scrapy.Item):
    """One scraped story."""

    name = scrapy.Field()   # text of the story link on the query page
    story = scrapy.Field()  # whitespace-cleaned text from the detail page


class MySpider(scrapy.Spider):
    """Follow every ``display_story`` link found on the query page."""

    name = 'medical'
    allowed_domains = ['yananow.org']
    start_urls = ['http://yananow.org/query_stories.php']

    def parse(self, response):
        """Yield one detail-page request per story link.

        :param response: response for the query page in ``start_urls``.
        :returns: generator of ``scrapy.Request`` objects, each carrying a
            ``MyItem`` (with ``name`` set) under ``meta['myitem']``.
        """
        # Match on link content instead of walking the nested table layout.
        rows = response.xpath('//a[contains(@href,"display_story")]')
        for row in rows:  # loop over the links to the stories
            # A distinct local name keeps the MyItem class from being shadowed.
            item = MyItem()
            item['name'] = row.xpath('./text()').extract()  # name comes from the link text
            # The XPath predicate above guarantees @href exists, so [0] is safe.
            story_url = response.urljoin(row.xpath('./@href').extract()[0])
            request = scrapy.Request(url=story_url, callback=self.parse_detail)
            request.meta['myitem'] = item  # hand the item over to parse_detail
            yield request

    def parse_detail(self, response):
        """Complete the item passed in ``meta`` with the story text.

        :param response: response for a story detail page; must carry the
            item created in ``parse`` under ``meta['myitem']``.
        :returns: generator yielding the completed ``MyItem``.
        """
        item = response.meta['myitem']  # item (with name) created in parse()
        text_raw = response.xpath('//font[@size=3]//text()').extract()
        # Strip each fragment via its own method rather than ``unicode.strip``,
        # which only exists on Python 2.
        item['story'] = ' '.join(fragment.strip() for fragment in text_raw)
        yield item
Comments
Post a Comment