I have Python code that takes an Arabic word, extracts its root, and also removes diacritics, but I have a problem with the output. For example: when the input is "العربيه" the output is "عرب", which is the right answer, but when the input is "كاتب" the output is "ب", and when the input is "يخاف" the output is " خف".
# This is my code:
# -*- coding=utf-8 -*-
import re
from arabic_const import *
import Tashaphyne
from Tashaphyne import *
import enum
from enum import Enum

# Search modes accepted by normalize_text(), which compares its ``searchtype``
# argument against ``search_type.root_word.index``.
# NOTE(review): the positional-names constructor and the ``.index`` attribute
# are presumably the legacy PyPI ``enum`` package API, not the standard
# library ``enum`` module -- confirm which one is installed.
search_type = Enum('unvoc_word', 'voc_word', 'root_word')

# Character classes built from the arabic_const code points.
HARAKAT_pat = re.compile(
    u"[" + u"".join([FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA,
                     SUKUN, SHADDA]) + u"]")
HAMZAT_pat = re.compile(u"[" + u"".join([WAW_HAMZA, YEH_HAMZA]) + u"]")
ALEFAT_pat = re.compile(
    u"[" + u"".join([ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW,
                     HAMZA_ABOVE, HAMZA_BELOW]) + u"]")
LAMALEFAT_pat = re.compile(
    u"[" + u"".join([LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW,
                     LAM_ALEF_MADDA_ABOVE]) + u"]")


#--------------------------------------
def strip_tashkeel(w):
    "Remove every character matched by HARAKAT_pat from a word and return the result."
    return HARAKAT_pat.sub('', w)


#--------------------------------------
def strip_tatweel(w):
    "Remove every TATWEEL character from a word and return the result."
    return re.sub(u'[%s]' % TATWEEL, '', w)


#--------------------------------------
def normalize_hamza(w):
    "Replace the ALEFAT_pat characters with ALEF, then the HAMZAT_pat characters with HAMZA."
    w = ALEFAT_pat.sub(ALEF, w)
    return HAMZAT_pat.sub(HAMZA, w)


#--------------------------------------
def normalize_lamalef(w):
    "Replace every LAM-ALEF ligature matched by LAMALEFAT_pat with LAM followed by ALEF."
    return LAMALEFAT_pat.sub(u'%s%s' % (LAM, ALEF), w)


#--------------------------------------
def normalize_spellerrors(w):
    "Replace TEH_MARBUTA with HEH and ALEF_MAKSURA with YEH."
    w = re.sub(u'[%s]' % TEH_MARBUTA, HEH, w)
    return re.sub(u'[%s]' % ALEF_MAKSURA, YEH, w)


def guess_stem(self, word):
    """
    Detect affixed letters based on phonetic root composition.
    In the Arabic language, there are some letters which can't be adjacent
    in a root, although the same sequence can result from affixation.
    When such a sequence is found at the start or the end of the word, the
    outer letter(s) are taken to be an affix and a '-' is inserted at that
    position.
    @param self: unused; kept so existing callers of the two-argument
        signature keep working (pass None when calling it as a function).
    @param word: the word.
    @type word: unicode.
    @return: word with '-' to indicate the guessed stemming position(s);
        the word with its marks stripped when nothing is detected.
    @rtype: unicode
    """
    # We strip harakat and shadda.
    # NOTE(review): ar_strip_marks is not defined in the visible source;
    # presumably it is provided by one of the star imports -- confirm.
    word = ar_strip_marks(word)

    # --- treat one prefix letter ---
    prefixes_letters = (TEH, MEEM, LAM, WAW, BEH, KAF, FEH, HAMZA, YEH, NOON)
    # first letter -> second letters that cannot follow it inside a root
    prefixes_forbiden = {
        ALEF_HAMZA_ABOVE: (ALEF_HAMZA_ABOVE, ZAH, AIN, GHAIN),
        BEH: (BEH, FEH, MEEM),
        TEH: (THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH),
        FEH: (BEH, FEH, MEEM),
        KAF: (JEEM, DAD, TAH, ZAH, QAF, KAF),
        LAM: (REH, SHEEN, LAM, NOON),
        MEEM: (BEH, FEH, MEEM),
        NOON: (REH, LAM, NOON),
        WAW: (WAW, YEH),
        YEH: (THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH,
              GHAIN, KAF, HEH, YEH),
    }
    word_guess = word
    if len(word) >= 2:
        c1 = word[0]
        c2 = word[1]
        # .get() replaces the Python 2-only has_key() test.
        if c2 in prefixes_forbiden.get(c1, ()):
            word_guess = u"%s-%s" % (c1, word[1:])
            # NOTE(review): the nesting of this second check under the first
            # one is reconstructed (indices 2 and 3 only make sense once the
            # '-' has been inserted); the original indentation was lost.
            if len(word_guess) >= 4:
                c1 = word_guess[2]
                c2 = word_guess[3]
                # HAMZA is listed in prefixes_letters but is not a key of
                # prefixes_forbiden, so a plain [] lookup raised KeyError.
                if c1 in prefixes_letters and c2 in prefixes_forbiden.get(c1, ()):
                    # NOTE(review): this rebuild drops the first prefix
                    # letter and repeats c1 (word_guess[2:] still starts
                    # with c1). Kept as in the original; confirm the
                    # intended output format before changing it.
                    word_guess = u"%s-%s" % (c1, word_guess[2:])

    # --- treat two suffix letters ---
    bisuffixes_letters = (KAF + MEEM, KAF + NOON, HEH + MEEM, HEH + NOON)
    # two-letter ending -> letters that cannot directly precede it in a root
    bisuffixes_forbiden = {
        HEH + MEEM: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH,
                     THEH, HAH, KHAH, SAD, DAD, TAH, ZAH, AIN, GHAIN, HEH,
                     YEH),
        KAF + MEEM: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH,
                     THEH, JEEM, KHAH, ZAIN, SEEN, SHEEN, DAD, TAH, ZAH,
                     GHAIN, FEH, QAF, KAF, LAM, NOON, HEH, YEH),
        HEH + NOON: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH,
                     THEH, JEEM, HAH, KHAH, SAD, DAD, TAH, ZAH, AIN, GHAIN,
                     HEH, YEH),
        KAF + NOON: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH,
                     THEH, JEEM, HAH, KHAH, THAL, SHEEN, DAD, TAH, ZAH, AIN,
                     GHAIN, QAF, KAF, NOON, HEH, YEH),
    }
    # Continue from the prefix-marked form, so any '-' already inserted is
    # carried into the suffix checks.
    word = word_guess
    if len(word) >= 3:
        bc_last = word[-2:]
        bc_blast = word[-3:-2]
        if bc_last in bisuffixes_letters:
            if bc_blast in bisuffixes_forbiden[bc_last]:
                word_guess = u"%s-%s" % (word[:-2], bc_last)

    # --- treat one suffix letter ---
    suffixes_letters = (KAF, TEH, HEH)
    # final letter -> letters that cannot directly precede it in a root
    suffixes_forbiden = {
        TEH: (THEH, JEEM, DAL, THAL, ZAIN, SHEEN, TAH, ZAH),
        KAF: (THEH, JEEM, KHAH, THAL, TAH, ZAH, GHAIN, QAF),
        HEH: (TEH, HAH, KHAH, DAL, REH, SEEN, SHEEN, SAD, ZAH, AIN, GHAIN),
    }
    word = word_guess
    # Slices (not indexes) so an empty word yields u'' instead of raising.
    c_last = word[-1:]
    c_blast = word[-2:-1]
    if c_last in suffixes_letters:
        if c_blast in suffixes_forbiden[c_last]:
            word_guess = u"%s-%s" % (word[:-1], c_last)
    return word_guess


def normalize_text(word, searchtype):
    """
    Run the normalization steps on a word, printing the word after each one.
    The steps, in order: strip_tashkeel, strip_tatweel, normalize_lamalef,
    normalize_hamza, normalize_spellerrors. When searchtype equals
    search_type.root_word.index the result is additionally passed through
    guess_stem.
    @param word: the word.
    @type word: unicode.
    @param searchtype: compared against search_type.root_word.index.
    @return: the normalized (and, in root mode, affix-marked) word.
    @rtype: unicode
    """
    word = strip_tashkeel(word)
    print(word)
    word = strip_tatweel(word)
    print(word)
    word = normalize_lamalef(word)
    print(word)
    word = normalize_hamza(word)
    print(word)
    word = normalize_spellerrors(word)
    print(word)
    if searchtype == search_type.root_word.index:
        # Disabled alternative kept from the original (it was a bare string
        # literal there, i.e. never executed):
        #   ArListem=ArabicLightStemmer()
        #   stem=ArListem.lightStem(word)
        #   word=ArListem.get_stem()
        #   print word
        #   w=ArListem.get_prefix()
        #   print w
        #   word=ArListem.get_root()
        #
        # The original call was guess_stem(word, w): ``w`` only exists in the
        # disabled code above (NameError), and the word landed in the unused
        # ``self`` slot instead of ``word``.
        word = guess_stem(None, word)
        print(word)
    return word

# -- http://mail.python.org/mailman/listinfo/python-list