import urllib
from HTMLParser import HTMLParser

import settings

# Event kinds passed as the TYPE_ argument to the state machine and to the
# start()/stop()/keep() hooks.  Each one is emitted by the HTMLScraper
# handle_* callback named next to it.
TYPE_STARTTAG = 1     # handle_starttag
TYPE_STARTENDTAG = 2  # handle_startendtag
TYPE_ENDTAG = 3       # handle_endtag
TYPE_DATA = 4         # handle_data


## Finite State Machine...

class _BaseState(object):
    """Do-nothing state; the other states override what they need.

    States are never instantiated.  Every method is a staticmethod whose
    first argument (named ``self``) is the scraper instance, which the
    scraper passes in explicitly.
    """

    @staticmethod
    def enter(self):
        """Hook run by the scraper right after this state becomes current."""
        return None

    @staticmethod
    def start_scrape(self):
        """No-op hook (no caller is visible in this module)."""
        return None

    @staticmethod
    def end_scrape(self):
        """No-op hook (no caller is visible in this module)."""
        return None

    @staticmethod
    def think(self, TYPE_, tagordata, attrs=[]):
        """Handle one parser event.

        Returns the state class to switch to, or None to stay in the
        current state.  This base version ignores the event.
        """
        return None

    @staticmethod
    def exit(self):
        """Hook run by the scraper just before this state is replaced."""
        return None

class ScanningState(_BaseState):
    """State used while looking for the place where scraping should begin."""

    @staticmethod
    def think(self, TYPE_, tagordata, attrs=[]):
        """Ask the scraper's start() hook whether this event is the marker.

        Returns ScrapingState when start() answers true, otherwise None.
        The event that triggers the switch is itself not kept.
        """
        found_start = self.start(TYPE_, tagordata, attrs)
        return ScrapingState if found_start else None

class ScrapingState(_BaseState):
    """State used while events are being collected into the scraper's data."""

    @staticmethod
    def think(self, TYPE_, tagordata, attrs=[]):
        """Collect the event, or finish if the stop() hook recognises it.

        Returns FinishedState when stop() answers true (that event is not
        kept).  Otherwise the event is handed to keep() together with the
        scraper's data list and None is returned.  Note that keep() does
        not receive the tag attributes.
        """
        if not self.stop(TYPE_, tagordata, attrs):
            self.keep(TYPE_, tagordata, self.data)
            return None
        return FinishedState

class FinishedState(_BaseState):
    """State entered once the stop marker has been seen.

    Entering it tidies up the collected data; the first event received
    afterwards moves the scraper to _BaseState, which ignores everything.
    """

    @staticmethod
    def enter(self):
        """Drop one trailing blank line from the scraper's data, if present.

        This is done on entry rather than in think() so the clean-up also
        happens when no further parser event follows the stop marker.  The
        emptiness check avoids an IndexError if nothing was collected.
        """
        if self.data and self.data[-1].rstrip() == '':
            del self.data[-1]

    @staticmethod
    def think(self, TYPE_, tagordata, attrs=[]):
        """Ignore the event and switch to the inert _BaseState."""
        return _BaseState


## Scraper parent class

class HTMLScraper(HTMLParser):
    """HTMLParser that collects part of a document via a small state machine.

    Each parser callback is turned into an event (an event-type constant,
    the tag name or text, and the tag attributes) and handed to the current
    state class.  Subclasses decide what is collected by overriding
    start(), stop() and keep().  Collected lines end up in ``self.data``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.data = []
        self.current_state = None

    def start(self, TYPE_, tagordata, attrs=[]):
        """Override this in subclass.

        Return true when this event marks the point where scraping begins.
        """
        pass

    def stop(self, TYPE_, tagordata, attrs=[]):
        """Override this in subclass.

        Return true when this event marks the point where scraping ends.
        """
        pass

    def keep(self, TYPE_, tagordata, data_array):
        """Override this in subclass.

        Called for every event between the start and stop markers;
        data_array is ``self.data`` and is meant to be modified in place.
        """
        pass

    def feed(self, html):
        """Scrape a complete document, replacing any previous results."""
        # HTMLParser keeps unparsed input buffered between feed() calls;
        # discard it so the tail of an earlier document cannot leak into
        # this one.
        self.reset()
        # Slice assignment keeps the same list object for anyone holding it.
        self.data[:] = ['']
        self.current_state = _BaseState
        self.set_state(ScanningState)
        HTMLParser.feed(self, html)

    def handle_starttag(self, tag, attrs):
        self.think(TYPE_STARTTAG, tag, attrs)

    def handle_startendtag(self, tag, attrs):
        self.think(TYPE_STARTENDTAG, tag, attrs)

    def handle_endtag(self, tag):
        self.think(TYPE_ENDTAG, tag)

    def handle_data(self, data):
        # Only trailing newlines are removed; other whitespace is kept.
        data = data.rstrip('\n')
        self.think(TYPE_DATA, data)

    def set_state(self, new_state):
        """Switch to new_state, running the exit/enter hooks.

        A false new_state (e.g. None) leaves the current state untouched.
        """
        if not new_state:
            return
        if __debug__:
            print('%s: %s -> %s' % (
                self.__class__.__name__,
                self.current_state.__name__,
                new_state.__name__,
            ))
        self.current_state.exit(self)
        self.current_state = new_state
        self.current_state.enter(self)

    def think(self, TYPE_, tagordata, attrs=None):
        """Pass one event to the current state and follow its decision.

        These args come from the handle_* methods, which are invoked by the
        parent class's feed() method.  attrs defaults to an empty list for
        events that carry no attributes.
        """
        if attrs is None:
            attrs = []
        # Forward the real attributes so start()/stop() hooks can use them.
        new_state = self.current_state.think(
            self, TYPE_, tagordata, attrs=attrs
        )
        if new_state:
            self.set_state(new_state)


## Custom scraper for the Simple Free Online English Language Dictionary

class SFOELD(HTMLScraper):
    """A scraper written for the Simple Free Online English Language Dictionary
    at http://www.online-utility.org/english/dictionary.jsp.

    The service URL is read from ``settings.online_dictionary``.  After
    lookup(), the scraped lines are available in ``self.data``.
    """

    def __init__(self):
        HTMLScraper.__init__(self)
        self.url = settings.online_dictionary
        # Word being looked up; set by lookup() or directly by the caller.
        self.word = None

    def lookup(self, word):
        """Fetch the dictionary page for word and scrape it into self.data.

        This is best effort: a failed download or parse is not raised to
        the caller.  In that case self.data holds whatever was collected
        before the failure, which is nothing at all if the download failed.
        """
        self.word = word
        # Forget the previous lookup so that a failure below cannot leave
        # another word's definition behind.
        del self.data[:]
        try:
            # urlencode escapes spaces, '&', '#' and other reserved characters.
            query = urllib.urlencode({'word': word})
            conn = urllib.urlopen('%s?%s' % (self.url, query))
            try:
                html = conn.read()
            finally:
                conn.close()
            self.feed(html)
        except Exception as err:
            # Unlike a bare except, this lets KeyboardInterrupt and
            # SystemExit through.  The failure is reported the same way as
            # the state transitions: in debug mode only.
            if __debug__:
                print('%s: lookup of %r failed: %s' % (
                    self.__class__.__name__, word, err
                ))

    def start(self, TYPE_, tagordata, attrs=[]):
        """Begin scraping at the text containing 'meaning of <word>'."""
        if TYPE_ != TYPE_DATA or self.word is None:
            return False
        # Both sides are lower-cased so a capitalised word still matches.
        return 'meaning of ' + self.word.lower() in tagordata.lower()

    def stop(self, TYPE_, tagordata, attrs=[]):
        """Stop scraping at the first self-closing <p/> tag."""
        return TYPE_ == TYPE_STARTENDTAG and tagordata.lower() == 'p'

    def keep(self, TYPE_, tagordata, data_array):
        """Collect text; a self-closing <br/> starts a new line."""
        if TYPE_ == TYPE_STARTENDTAG:
            if tagordata == 'br':
                data_array.append('')
        elif TYPE_ == TYPE_DATA:
            data_array[-1] += tagordata
    
if __name__ == '__main__':
    # Manual smoke test: scrape the definition of one word, either from a
    # saved page or from the live service, and print the result.

    ## change the index: 0=URL, 1=local file.
    html_file = ('', '../test_html.html')[1]
    url = 'http://www.online-utility.org/english/dictionary.jsp'
    keyword = 'box'

    if html_file:
        # 'with' closes the file even if reading fails.
        with open(html_file) as fh:
            html = fh.read()
    else:
        query = urllib.urlencode({'word': keyword})
        conn = urllib.urlopen('%s?%s' % (url, query))
        try:
            html = conn.read()
        finally:
            conn.close()

    html_parser = SFOELD()
    # feed() is called directly instead of lookup(), so set the word by hand.
    html_parser.word = keyword
    html_parser.feed(html)

    print('\n'.join(html_parser.data))
