Hello,
Could you please tell me how I can use Python code to tag an Arabic text file?
My code is as follows:
# -*- coding: cp1256 -*-
import codecs
from nltk.tagger import *
from nltk.corpus import brown
from nltk.tokenizer import WhitespaceTokenizer
from nltk import *
from nltk.tokenreader.tagged import TaggedTokenReader
import codecs
from nltk.tagger import *
from nltk.corpus import brown
from nltk.tokenizer import WhitespaceTokenizer
from nltk import *
from nltk.tokenreader.tagged import TaggedTokenReader
# Script body: read tagged text from 'fataha2.txt', train a UnigramTagger on
# it, then tag one sample sentence and print the result.
# NOTE(review): this listing has lost its indentation and several statements
# appear twice (apparently a paste artifact). The loop bodies are presumably
# the statements that directly follow each `for` line -- confirm against the
# original file before running.
# NOTE(review): the original comment here said "Tokenize ten texts from the
# Brown Corpus", but nothing below reads the Brown Corpus; the training text
# comes from 'fataha2.txt'.
train_tokens = []
train_tokens = []
# NOTE(review): under Python 2, open(...).read() returns an undecoded byte
# string. The traceback reported with this script shows
# codecs.encode(text_str, 'cp1256') failing with an implicit ASCII decode
# (byte 0xc8): encoding a byte string first decodes it as ASCII. Presumably
# the text should be *decoded* from cp1256 instead (for example
# text_str.decode('cp1256')) -- TODO confirm that this NLTK version accepts
# unicode input.
text_str = (open('fataha2.txt').read())
#codecs.encode(text_str,'cp1256')
# Parse the file contents into a token whose 'WORDS' entry holds the subtokens.
reader = TaggedTokenReader(SUBTOKENS='WORDS')
text_token = reader.read_token(text_str)
print text_token['WORDS']
# NOTE(review): the next four lines repeat the four lines above.
#codecs.encode(text_str,'cp1256')
reader = TaggedTokenReader(SUBTOKENS='WORDS')
text_token = reader.read_token(text_str)
print text_token['WORDS']
# Collect every word token read from the file as training data.
# NOTE(review): the append appears twice; if both lines are really inside the
# loop, each token is added to train_tokens twice.
for l in text_token['WORDS']:
train_tokens.append(l)
train_tokens.append(l)
#Initialise and train a unigram tagger
mytagger = UnigramTagger(SUBTOKENS='WORDS')
mytagger = UnigramTagger(SUBTOKENS='WORDS')
# For each collected token, re-read its 'TEXT' with the tagged reader, copy
# the resulting 'WORDS' list under the 'SUBTOKENS' key, and train on it.
for xx in train_tokens:
cc = reader.read_token(xx['TEXT'])
#! print cc.keys()
cc['SUBTOKENS']= cc['WORDS']
mytagger.train(cc)
#Once a UnigramTagger has been trained, the tag() method can be used to tag new text:
# NOTE(review): the literal below is presumably Arabic text stored as cp1256
# bytes (per the coding declaration at the top of the script); it is shown
# here as it appears in the listing.
text_token = Token(TEXT="ÇáÍãÏ ááå ÑÈ ÇáÚÇáãíä")
# Split the sample sentence on whitespace into 'WORDS' subtokens, then tag them.
WhitespaceTokenizer(SUBTOKENS='WORDS').tokenize(text_token)
mytagger.tag(text_token)
#print 'The first example : Using Unigram Tagger the reseults are : '
print
print text_token
# NOTE(review): everything from here to the end repeats the training and
# tagging statements above, using the last value of xx left over from the
# loop -- presumably a paste duplicate rather than intended code.
cc = reader.read_token(xx['TEXT'])
#! print cc.keys()
cc['SUBTOKENS']= cc['WORDS']
mytagger.train(cc)
#Once a UnigramTagger has been trained, the tag() method can be used to tag new text:
text_token = Token(TEXT="ÇáÍãÏ ááå ÑÈ ÇáÚÇáãíä")
WhitespaceTokenizer(SUBTOKENS='WORDS').tokenize(text_token)
mytagger.tag(text_token)
#print 'The first example : Using Unigram Tagger the reseults are : '
print text_token
And I got the following error:
Traceback (most recent call last):
File "I:/examples/unigramgtag1update1.py", line 13, in ?
codecs.encode(text_str,'cp1256')
File "C:\Python24\lib\encodings\cp1256.py", line 18, in encode
return codecs.charmap_encode(input,errors,encoding_map)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc8 in position 0: ordinal not in range(128)
File "I:/examples/unigramgtag1update1.py", line 13, in ?
codecs.encode(text_str,'cp1256')
File "C:\Python24\lib\encodings\cp1256.py", line 18, in encode
return codecs.charmap_encode(input,errors,encoding_map)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc8 in position 0: ordinal not in range(128)
Please help.
Yahoo! FareChase - Search multiple travel sites in one click.
-- http://mail.python.org/mailman/listinfo/python-list