# On Sun, 31 Jan 2016 05:29 pm, Veek. M wrote:
# > I'm using lxml.html
#
# Hmmm. Well, I've never used lxml, but the first obvious problem I see is
# that your lines:
#
#     description = li_item.find_class('vip')[0].text_content()
#     link = li_item.find_class('vip')[0].get('href')
#     price_dollar = li_item.find_class('lvprice prc')[0].xpath('span')[0].text
#     bids = li_item.find_class('lvformat')[0].xpath('span')[0].text
#
# look suspiciously like a violation of the Law of Demeter. ("Talk to your
# dog, not to the dog's legs!") A long series of chained dot accesses (or
# equivalent getitem, call, getitem, dot, etc) is a code-smell suggesting
# that you are trying to control your dog's individual legs, instead of just
# calling the dog.
#
# But, I'll assume that this is part of the design of lxml, and so allowed.
# So let's refactor by adding some helper methods and tidying the parse_page
# method. This will also make it easier to test, refactor and maintain the
# code, especially if the format of the XML file changes.
#
# NOTE: the functions below are written as methods (they take ``self``) and
# are meant to be pasted into the parser class. Two alternatives are given;
# the second one redefines ``get_time`` and ``parse_page``, so use one
# alternative or the other, not both.

import time

# XPath selecting every <li> whose id matches ^item[a-z0-9]+$, using the
# EXSLT regular-expressions extension.
_ITEM_XPATH = '//li[re:test(@id, "^item[a-z0-9]+$")]'
_EXSLT_NAMESPACES = {'re': "http://exslt.org/regular-expressions"}


# ---------------------------------------------------------------------------
# Alternative 1: one generic helper plus small extractor callables.
# ---------------------------------------------------------------------------

def extract(self, item, clsname, extractor, default="unknown"):
    """Return ``extractor(matches)`` for item's elements of class clsname.

    ``matches`` is the list returned by ``item.find_class(clsname)``.
    *default* is returned instead when that list is empty, or when the
    extractor indexes past the end of a (sub)list, e.g. because an expected
    child element is absent.
    """
    # find_class() reports "no such class" by returning an empty list, not
    # by raising, so test for emptiness instead of catching an exception.
    matches = item.find_class(clsname)
    if not matches:
        return default
    try:
        return extractor(matches)
    except IndexError:
        # The class was found but the structure beneath it was not.
        return default


def get_time(self, item, clsname, default='No time found'):
    """Return the 'timems' value of item's clsname element, relative to now.

    The attribute is read from the first ``span`` child of the first
    matching element, divided by 1000 and offset by ``time.time()``
    (presumably epoch milliseconds -> seconds from now; confirm against the
    page). *default* is returned when the element or attribute is missing.
    """
    def timems(matches):
        return matches[0].xpath('span')[0].get('timems')

    t = self.extract(item, clsname, timems, None)
    if t is None:
        return default
    return int(t) / 1000 - time.time()


def parse_page(self, root):
    """Print a one-line summary for every matching <li> item under root."""
    for li_item in root.xpath(_ITEM_XPATH, namespaces=_EXSLT_NAMESPACES):
        # Extracted as in the original post, but not part of the printed line.
        description = self.extract(
            li_item, 'vip', lambda obj: obj[0].text_content(),
            "no description")
        link = self.extract(
            li_item, 'vip', lambda obj: obj[0].get('href'))
        price_dollar = self.extract(
            li_item, 'lvprice prc', lambda obj: obj[0].xpath('span')[0].text)
        bids = self.extract(
            li_item, 'lvformat', lambda obj: obj[0].xpath('span')[0].text)
        # NOTE(review): despite the name, get_time() does not convert to hours.
        time_hrs = self.get_time(li_item, 'tme')
        shipping = self.extract(
            li_item, 'lvshipping',
            lambda obj: obj[0].xpath('span/span/span')[0].text_content())

        print('{} {} {} {} {}'.format(
            link, price_dollar, time_hrs, shipping, bids))
        print('-'*70)


#######################
#
# If you prefer a more Java-style object-oriented solution:

# ---------------------------------------------------------------------------
# Alternative 2: one getter per field, each accepting "maybe a match list".
# ---------------------------------------------------------------------------

def get_class(self, item, clsname):
    """Return item's elements with class clsname, or None if there are none."""
    # An empty result list is normalised to None so the getters below only
    # need a single "is None" test.
    return item.find_class(clsname) or None


def get_description(self, maybe_cls, default="unknown"):
    """Return the text content of the first match, or default."""
    if maybe_cls is None:
        return default
    return maybe_cls[0].text_content()


def get_link(self, maybe_cls, tag='href', default='none'):
    """Return attribute *tag* of the first match, or default if no match."""
    if maybe_cls is None:
        return default
    return maybe_cls[0].get(tag)


def get_text(self, maybe_cls, default='unknown'):
    """Return the text of the first match's first ``span`` child, or default."""
    if maybe_cls is None:
        return default
    spans = maybe_cls[0].xpath('span')
    if not spans:
        return default
    return spans[0].text


def get_time(self, maybe_cls, default='No time found'):
    """Return the first match's 'timems' value relative to now, or default.

    The value is ``int(timems) / 1000 - time.time()``; *default* is returned
    when there is no match, no ``span`` child, or no 'timems' attribute.
    """
    if maybe_cls is None:
        return default
    spans = maybe_cls[0].xpath('span')
    if not spans:
        return default
    t = spans[0].get('timems')
    if t is None:
        return default
    return int(t) / 1000 - time.time()


def get_shipping(self, maybe_cls, default='unknown shipping'):
    """Return the text content at ``span/span/span`` of the first match."""
    if maybe_cls is None:
        return default
    nested = maybe_cls[0].xpath('span/span/span')
    if not nested:
        return default
    return nested[0].text_content()


def parse_page(self, root):
    """Print a one-line summary for every matching <li> item under root."""
    for li_item in root.xpath(_ITEM_XPATH, namespaces=_EXSLT_NAMESPACES):
        # Extracted as in the original post, but not part of the printed line.
        description = self.get_description(
            self.get_class(li_item, 'vip'), "no description")
        link = self.get_link(self.get_class(li_item, 'vip'))
        price_dollar = self.get_text(
            self.get_class(li_item, 'lvprice prc'))
        bids = self.get_text(
            self.get_class(li_item, 'lvformat'))
        time_hrs = self.get_time(self.get_class(li_item, 'tme'))
        shipping = self.get_shipping(
            self.get_class(li_item, 'lvshipping'))

        print('{} {} {} {} {}'.format(
            link, price_dollar, time_hrs, shipping, bids))
        print('-'*70)


# Obviously I haven't tested this code.
#
# --
# Steven
#
# --
# https://mail.python.org/mailman/listinfo/python-list