Here is a script I’ve written for python3 (I use Wasta Linux 18.04). Note that I have hard-coded the path for the input and output in the code…
#!/usr/bin/python3
# run with python3 convertLexicon2SFM.py3
# BEST TO IMPORT INTO TEXT AND WORDS - not into the dictionary itself.
# -Import into Standard Format Words and Glosses
#
# MODIFIED AND ONLY MILDLY TESTED!!!! It only includes words that 1.) have glosses
# and 2.) are marked as correctly spelled and 3.) exist in the text.
#
import codecs
from lxml import etree
# Convert a Lexicon.xml file plus a Paratext wordlist export into a single
# standard-format (.sfm) file.
#
# Lexicon entries of Type "Word" are written only when their Form is among the
# wordlist items whose spelling is "Correct"; entries of any other Type are
# always written.  Approved words that never matched a lexicon entry are
# appended at the end as bare \lx records.

# Input and output locations are hard-coded; edit these paths to suit.
LEXICON_PATH = "/home/justin/Desktop/Lexicon.xml"
# Paratext Wordlist Export as XML
WORDLIST_PATH = "/home/justin/Desktop/wordlist.xml"
OUTPUT_PATH = "/home/justin/Desktop/PT7_dictionary_py3.sfm"

# Marker written for each supported gloss language; Gloss elements in any
# other language are ignored.
GLOSS_MARKERS = {"English": "\\ge", "Korean": "\\g_Kor"}

xmldoc = etree.parse(LEXICON_PATH)
wordlist = etree.parse(WORDLIST_PATH)

# Partition the wordlist items by their spelling status.
wordlistItems = wordlist.getroot().findall("item")
wordlistTotal = len(wordlistItems)
# Approved forms are collected from the last wordlist item to the first,
# which fixes the order of the unglossed records written at the end.
approvedWords = [
    word.attrib['word']
    for word in reversed(wordlistItems)
    if word.attrib['spelling'] == "Correct"
]
# Despite its name, this holds the items that are NOT marked "Correct";
# only its length is used, for the progress output below.
correctWords = [
    word for word in wordlistItems if word.attrib['spelling'] != "Correct"
]

itemList = xmldoc.getroot().findall("Entries/item")

# The with-block guarantees the output file is closed even if a write or an
# attribute lookup fails part-way through.
with codecs.open(OUTPUT_PATH, mode="w", encoding='utf-8') as outfile:
    # Backslashes are escaped explicitly: "\_" is not a valid Python escape
    # sequence and triggers a warning on current interpreters.
    outfile.write("\\_sh v3.0 400 MDF\n\\_DateStampHasFourDigitYear\n\n")

    for item in itemList:
        lexeme = next(item.iter("Lexeme"), None)
        if lexeme is None:
            # An entry without a Lexeme element cannot be written; report it
            # and carry on instead of aborting the run with StopIteration.
            print('skipped', 'entry without a Lexeme element')
            continue

        lexemeType = lexeme.get("Type")
        form = lexeme.get("Form")

        if lexemeType == 'Word' and form not in approvedWords:
            print('unused', lexemeType, form, "wordlist", wordlistTotal,
                  "incorrect", len(correctWords))
            continue

        occurrences = approvedWords.count(form)
        print('good', lexemeType, form, 'count', occurrences,
              'unglossed remaining', len(approvedWords))
        if occurrences:
            # Drop one occurrence so this form is not repeated among the
            # unglossed records at the end.
            approvedWords.remove(form)

        # A suffix gets a leading hyphen, a prefix a trailing one.
        leading = "-" if lexemeType == "Suffix" else ""
        trailing = "-" if lexemeType == "Prefix" else ""
        outfile.write("\n\\lx %s%s%s\n" % (leading, form, trailing))
        outfile.write("\\co_Eng %s\n" % lexemeType)

        # One numbered \sn record per English or Korean gloss, in document
        # order; the counter is shared between the two languages.
        sense = 1
        for gloss in item.iter("Gloss"):
            marker = GLOSS_MARKERS.get(gloss.get("Language"))
            if marker is None:
                continue
            outfile.write("\\sn %s\n" % sense)
            # An empty Gloss element has text None; write an empty gloss
            # rather than the literal string "None".
            outfile.write("%s %s\n" % (marker, gloss.text or ""))
            sense += 1

    # Whatever is left was approved in the wordlist but matched no lexicon
    # entry: emit it as a bare record.
    for word in approvedWords:
        outfile.write("\n\\lx %s\n" % word)
        print("adding unglossed words", len(approvedWords), word)