def __get_tf(self, strs):
    """Return the term frequency of every token in ``strs``.

    The text is tokenized with ``strs.split(" ")``, so consecutive spaces
    (and an empty input string) produce empty-string tokens that are
    counted like any other word.

    Args:
        strs: A single line of text.

    Returns:
        A dict mapping each distinct token to
        ``occurrences / total number of tokens``, keyed in order of first
        appearance.
    """
    tokens = strs.split(" ")

    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1

    # split(" ") always yields at least one token, so the divisor is never 0.
    total = len(tokens)
    return {token: count / total for token, count in counts.items()}
def __build_iwf(self, lines):
    """Build the per-word IWF table and its median from ``lines``.

    Every line is tokenized with ``line.split(" ")`` and token occurrences
    are counted over the whole corpus. Each distinct word then receives the
    weight ``log10(number_of_distinct_words / occurrences_of_word)``.

    The occurrence counts are gathered in a local dict and ``self.iwf`` is
    replaced only once the weights are known, so calling this method again
    rebuilds the table instead of adding raw counts on top of previously
    computed log weights.

    Args:
        lines: Iterable of text lines making up the corpus.

    Side effects:
        Sets ``self.iwf`` to a dict of word -> weight (any previous
        contents are replaced) and ``self.median_iwf`` to the median of
        those weights, or NaN when the corpus yields no words.
    """
    counts = {}
    for line in lines:
        for word in line.split(" "):
            counts[word] = counts.get(word, 0) + 1

    # NOTE(review): the numerator is the vocabulary size (distinct words),
    # not the total token count, so a word occurring more often than there
    # are distinct words gets a negative weight -- confirm this is intended.
    vocab_size = len(counts)

    # math.log(x, 10) is kept (rather than math.log10) so the weights stay
    # numerically identical to what this method has always produced.
    self.iwf = {
        word: math.log(vocab_size / count, 10)
        for word, count in counts.items()
    }

    weights = list(self.iwf.values())
    # np.median([]) emits a RuntimeWarning before returning NaN; assign NaN
    # directly for an empty corpus.
    self.median_iwf = np.median(weights) if weights else float("nan")
def get_tfiwf(self, strs):
    """Return the TF-IWF weight of every token in ``strs``.

    Each token's term frequency (from ``self.__get_tf``) is multiplied by
    its entry in ``self.iwf``; tokens absent from ``self.iwf`` use
    ``self.median_iwf`` as the multiplier instead.

    Args:
        strs: A single line of text.

    Returns:
        A dict mapping each distinct token to ``tf * iwf``, keyed in the
        order produced by ``self.__get_tf``.
    """
    tf_by_word = self.__get_tf(strs)
    iwf = self.iwf

    # Iterate the TF dict instead of re-splitting the line: each distinct
    # word is weighted once, rather than once per occurrence.
    return {
        word: tf * (iwf[word] if word in iwf else self.median_iwf)
        for word, tf in tf_by_word.items()
    }