#!/usr/bin/env python

import sys
import string
from xml.etree import ElementTree

# Element and attribute names expected in vocab.xml

# <section> element: groups vocab entries and nested subsections
NODE_SECTION = "section"
SECTION_ATTR_INDEX = "index"  # optional integer used to order sections
SECTION_ATTR_TITLE = "title"  # label for the section

# <meta> element: a single key/value metadata pair attached to its parent
NODE_META = "meta"
META_ATTR_KEY = "key"
META_ATTR_VALUE = "value"

# <vocab> element: one English <-> Japanese mapping. "english" and "japanese"
# serve both as attribute names on <vocab> and as tags of child elements.
NODE_VOCAB = "vocab"
VOCAB_ATTR_ENGLISH = "english"
VOCAB_ATTR_JAPANESE = "japanese"
VOCAB_ATTR_TEXT = "text"  # attribute holding the word on a child english/japanese element

class Vocab:
  """
  Matching of English and Japanese words. There can be multiple translations 
  on either side of the mapping. For instance "すみません" means both 
  "excuse me" and "I'm sorry" while "はい" and "ええ" both mean "yes". Fields 
  include the following:
  english - list of English meanings
  japanese - list of Japanese meanings
  meta - metadata mappings
  
  Both English and Japanese vocab are in a VocabEntry wrapper with two fields:
  text - string of the word
  meta - metadata mappings
  """
  
  def __init__(self, vocabNode):
    """
    Builds the mapping from a <vocab> xml node. Translations can be given 
    either as 'english' / 'japanese' attributes on the node or as child 
    elements of the same names (which may carry their own metadata).
    
    Raises an AssertionError if the node isn't a vocab node, contains an 
    unrecognized child, or lacks a translation for either language.
    """
    
    # explicit raises rather than assert statements so validation still 
    # happens when python is run with -O
    if vocabNode.tag != NODE_VOCAB:
      raise AssertionError("invalid node: " + vocabNode.tag)
    
    self.english = []
    self.japanese = []
    self.meta = {}
    
    # tries to retrieve mapping as an attribute
    englishAttr = vocabNode.get(VOCAB_ATTR_ENGLISH)
    if englishAttr: self.english.append(Vocab.VocabEntry(englishAttr, {}))
    
    japaneseAttr = vocabNode.get(VOCAB_ATTR_JAPANESE)
    if japaneseAttr: self.japanese.append(Vocab.VocabEntry(japaneseAttr, {}))
    
    for child in vocabNode:
      if child.tag == NODE_META:
        key = child.get(META_ATTR_KEY)
        value = child.get(META_ATTR_VALUE)
        self.meta[key] = value
      elif child.tag == VOCAB_ATTR_ENGLISH:
        self.english.append(Vocab.VocabEntry.parseVocabEntryNode(child))
      elif child.tag == VOCAB_ATTR_JAPANESE:
        self.japanese.append(Vocab.VocabEntry.parseVocabEntryNode(child))
      else:
        message = "unrecognized node in vocab entry: " + child.tag
        raise AssertionError(message)
    
    # Fails if vocab not defined for either language
    if len(self.english) == 0:
      raise AssertionError("no English translation provided for a vocab entry")
    if len(self.japanese) == 0:
      raise AssertionError("no Japanese translation provided for a vocab entry")
  
  def isSimple(self):
    """
    True if this represents a one-to-one mapping of English and Japanese without any defined metadata.
    """
    return len(self.meta) == 0 and len(self.english) == 1 and len(self.english[0].meta) == 0 \
      and len(self.japanese) == 1 and len(self.japanese[0].meta) == 0
  
  def __str__(self):
    """
    Provides "<english> ↔ <japanese>". A side with a single translation is 
    shown bare, otherwise its translations are listed in parentheses.
    """
    return Vocab._formatEntries(self.english) + " ↔ " + Vocab._formatEntries(self.japanese)
  
  @staticmethod
  def _formatEntries(entries):
    # renders one side of the mapping: bare text for a single entry, else a 
    # parenthesized comma separated listing
    if len(entries) == 1: return entries[0].text
    return "(" + ", ".join([entry.text for entry in entries]) + ")"
  
  class VocabEntry:
    """
    Individual English or Japanese word or phrase.
    """
    
    def __init__(self, text, metadata):
      self.text = text
      self.meta = metadata

    # factory constructor for VocabEntry from xml node
    @staticmethod
    def parseVocabEntryNode(vocabEntryNode):
      """
      Builds a VocabEntry from an <english> or <japanese> node, reading the 
      word from its 'text' attribute and metadata from its <meta> children.
      
      Raises an AssertionError if the node has a different tag or contains 
      anything besides meta children.
      """
      
      if vocabEntryNode.tag not in (VOCAB_ATTR_ENGLISH, VOCAB_ATTR_JAPANESE):
        raise AssertionError("invalid node: " + vocabEntryNode.tag)
      
      text = vocabEntryNode.get(VOCAB_ATTR_TEXT)
      meta = {}
      
      for child in vocabEntryNode:
        if child.tag == NODE_META:
          key = child.get(META_ATTR_KEY)
          value = child.get(META_ATTR_VALUE)
          meta[key] = value
        else:
          message = "unrecognized node in vocab entry: " + child.tag
          raise AssertionError(message)
      
      return Vocab.VocabEntry(text, meta)

class Section:
  """
  Collection of vocab and subsections related by topic, chapter, etc with the following fields:
  vocab - list of vocabulary in this section
  sections - subsections with additional vocab, sorted by index
  title - label for section, None if not defined
  index - numeric index for sorting, -1 if not defined
  meta - metadata mappings
  """
  
  def __init__(self, sectionNode):
    """
    Builds the section, and recursively its subsections, from a <section> 
    xml node.
    
    Raises an AssertionError if the node isn't a section node, has a 
    non-numeric index, or contains an unrecognized child.
    """
    
    # explicit raise rather than an assert statement so validation still 
    # happens when python is run with -O
    if sectionNode.tag != NODE_SECTION:
      raise AssertionError("invalid node: " + sectionNode.tag)
    
    indexAttr = sectionNode.get(SECTION_ATTR_INDEX)
    try:
      if indexAttr: self.index = int(indexAttr)
      else: self.index = -1
    except ValueError:
      raise AssertionError("section indices must be numbers")
    self.title = sectionNode.get(SECTION_ATTR_TITLE)
    
    self.sections = []
    self.meta = {}
    self.vocab = []
    
    for child in sectionNode:
      if child.tag == NODE_SECTION:
        self.sections.append(Section(child))
      elif child.tag == NODE_META:
        key = child.get(META_ATTR_KEY)
        value = child.get(META_ATTR_VALUE)
        self.meta[key] = value
      elif child.tag == NODE_VOCAB:
        self.vocab.append(Vocab(child))
      else:
        # title is None when the attribute is missing, which can't be concatenated
        message = "unrecognized node in section '" + (self.title or "") + "': " + child.tag
        raise AssertionError(message)
    
    # sorting by key rather than relying on __cmp__, which python 3 ignores
    self.sections.sort(key = lambda section: section.index)
  
  def __str__(self):
    """
    Provides "<index>. <title>" followed by one indented line per vocab 
    entry, then each subsection indented and preceded by a blank line.
    """
    
    # untitled sections are shown with an empty title
    rep = [str(self.index), ". ", self.title or ""]
    indentation = "  "
    
    for vocab in self.vocab:
      rep.append("\n")
      rep.append(indentation)
      rep.append(str(vocab))
    
    # appends subsection text indented
    for subsection in self.sections:
      rep.append("\n") # blank line
      
      for line in str(subsection).split("\n"):
        rep.append("\n")
        rep.append(indentation) # indentation
        rep.append(line)
    
    return "".join(rep)
  
  def __lt__(self, other):
    # ordering hook used by python 3's sort() and sorted()
    return self.index < other.index
  
  def __cmp__(self, other):
    # kept for callers that invoke it directly, python 3 has no cmp() builtin 
    # so the -1 / 0 / 1 result is derived from the comparisons
    return (self.index > other.index) - (self.index < other.index)

def loadVocab(path):
  """
  Reads the xml file at the given path and provides its root element as a 
  Section (subsections and vocab are nested within it).
  
  An AssertionError raised while building the sections is re-raised as an 
  AssertionError prefixed with "unable to parse wordlist - ". Other 
  exceptions aren't intercepted here.
  """
  
  # No DTD validation is performed: there's no builtin module for it and a 
  # simple sanity check isn't worth complicating installation. Discussion 
  # of options can be found at:
  # http://www.programmingtalk.com/archive/index.php/%20%3C/t-30347.html
  
  try:
    tree = ElementTree.parse(path)
    root = tree.getroot()
    return Section(root)
  except AssertionError as exc:
    raise AssertionError("unable to parse wordlist - " + str(exc))

# Simple smoke test: parses the vocab file named on the command line and 
# prints each of the root's subsections
if __name__ == '__main__':
  if len(sys.argv) == 1:
    # no path given
    print("Usage: vocabParser.py <vocab xml>")
    sys.exit()
  
  wordlist = loadVocab(sys.argv[1])
  
  # print() applies str() to its argument, giving the Section's text
  for subsection in wordlist.sections:
    print(subsection)

