| Программирование python |
# NOTE(review): this chunk assumes `import nltk` and `import math` at the
# (unseen) top of the file — confirm they are present there.

class TextProcessor(object):
    """Tokenize, filter and stem a text, remembering for every stem the
    first original surface form that produced it."""

    def __init__(self, text):
        # Tokenizer/stemmer come from nltk (imported elsewhere in the file).
        words = nltk.PunktWordTokenizer().tokenize(text)
        # Keep only words longer than 3 characters.
        # NOTE(review): the original comment said "filter 1-2 symbol words",
        # but `len(x) > 3` also drops 3-character words — confirm intent.
        words = [w for w in words if len(w) > 3]
        stemmer = nltk.stem.snowball.RussianStemmer()
        # Materialize as lists: map/filter are lazy iterators in Python 3,
        # and the original code later calls len() on and indexes self.words.
        self.words = [stemmer.stem(w) for w in words]
        # Remember the first original form of each word of the text that
        # corresponds to a particular stemmed token.
        self.originalForm = self.saveOriginalFormOfWords(words)

    def saveOriginalFormOfWords(self, tokenizedText):
        """Return a dict mapping each stem in self.words to the first token
        of tokenizedText (the parallel pre-stemming sequence) producing it.

        Bug fix: the original tested `has_key(self.words)` (the whole list)
        and rebound `originalForm = tokenizedText` instead of indexing both
        sequences, so the mapping was never actually built.
        """
        originalForm = dict()
        for i in range(len(self.words)):
            # Keep only the first occurrence of each stem.
            if self.words[i] not in originalForm:
                originalForm[self.words[i]] = tokenizedText[i]
        return originalForm

    def objectCounter(self, listOfObjects):
        """Return a list of (object, number of occurrences) pairs.

        Used for counting unigrams and bigrams in text. Uses
        collections.Counter instead of the original O(n^2)
        `listOfObjects.count(obj)` per unique element; pair order is
        unspecified (the original set-based order was unspecified too).
        """
        from collections import Counter
        return list(Counter(listOfObjects).items())


class Trainer(object):
    """Training of the algorithm: computes (or loads) the IDF model.

    model is a pair: ({word -> idf_score}, number_of_files_in_training_set).
    loadModel / computeIDF / saveModel are defined elsewhere in the file.
    """

    model = None

    def __init__(self):
        # Try to load a previously saved model; otherwise compute the IDF
        # table from the training set and persist it.
        if not self.loadModel():
            self.model = self.computeIDF()
            self.saveModel()

    def getTopN(self, listOfObjectsWithCounts, N):
        """Return the N objects (unigrams or bigrams) with the highest
        counts from a list of (object, count) pairs, most frequent first."""
        ranked = sorted(listOfObjectsWithCounts,
                        key=lambda pair: -int(pair[1]))[:N]
        return [item[0] for item in ranked]

    def weightingScheme(self, listOfCountedObjects, model):
        """Assign TF*IDF weights to counted unigrams/bigrams.

        Each (name, count) pair becomes (name, count * idf). Words absent
        from the IDF table get the default unseen-word IDF
        -log(1/N) == log(N), where N = model[1] (training-set size); the
        default is cached into model[0] via setdefault (mutates the dict).
        NOTE(review): the extracted source read `-math.log(1.0 - model1)`,
        which is undefined for N >= 2; reconstructed as `1.0 / model[1]`
        (the standard unseen-word IDF) — confirm against the original.
        """
        return [(objectName,
                 objectCount * model[0].setdefault(
                     objectName, -math.log(1.0 / model[1])))
                for objectName, objectCount in listOfCountedObjects]
Комментарии