jieba word segmentation + TF-IDF text representation + SVM classification

import pandas as pd
import jieba
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.svm import SVC
def read_data(url):
    data = pd.read_csv(url, encoding='utf-8')
    data.fillna("null", inplace=True)  # inplace=True modifies the DataFrame itself
    return data

Data cleaning

def clean_text(text):
    text = str(text)
    text = text.replace('\n', '')
    text = text.replace('<br />', ' ')
    text = text.replace(';', ',')
    return text

def tokenize_df_content(data):
    # Clean content, title and ann_labels and collect them into a new DataFrame
    df_empty = pd.DataFrame()
    return (df_empty
            .assign(content_tokens=data["content"].map(clean_text))
            .assign(title_tokens=data['title'].map(clean_text))
            .assign(label_tokens=data['ann_labels'].map(clean_text)))

def stop_words():
    stopwords_list = []
    with open('stopword.txt', 'r', encoding='utf-8') as stop_words_file:
        for line in stop_words_file:
            stopwords_list.append(line.strip())  # strip the newline so stopwords actually match tokens
    return stopwords_list
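
As a quick sanity check, the helpers above can be exercised on a made-up two-row frame; the column names content, title and ann_labels match what tokenize_df_content expects, but the rows here are invented:

# Hypothetical smoke test for the cleaning helpers (rows are made up)
toy = pd.DataFrame({
    "content": ["第一段正文<br />带换行\n和分号;", None],
    "title": ["标题一", "标题二"],
    "ann_labels": ["标签A", "标签B"],
})
toy.fillna("null", inplace=True)       # mirror read_data's missing-value handling
toy_clean = tokenize_df_content(toy)
print(toy_clean.columns.tolist())      # ['content_tokens', 'title_tokens', 'label_tokens']
print(toy_clean["content_tokens"][0])  # -> 第一段正文 带换行和分号,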

Segmenting content & title

def jieba_data(data, i):
    # Segment title and content of row i with jieba, dropping stopwords
    stopwords_list = stop_words()  # note: re-reads stopword.txt on every call
    data_title = jieba.cut(data['title_tokens'][i])
    data_content = jieba.cut(data["content_tokens"][i])
    data_result = " "
    for title_word in data_title:
        if title_word not in stopwords_list:
            data_result += title_word + " "
    for content_word in data_content:
        if content_word not in stopwords_list:
            data_result += content_word + " "

    return data_result

def jieba_label(data_result, data, i):
    # Append the segmented ann_labels of row i to an existing result string
    stopwords_list = stop_words()
    data_label = jieba.cut(data['label_tokens'][i])
    for label_word in data_label:
        if label_word not in stopwords_list:
            data_result += label_word + " "

    return data_result
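
For reference, jieba.cut returns a lazy generator of tokens, which is why each result above is consumed in a single loop; a minimal illustration (the sentence is arbitrary and the exact split depends on jieba's dictionary):

tokens = list(jieba.cut("医疗机构采购仿制药"))  # wrap the generator in list() to inspect it
print(tokens)  # e.g. ['医疗机构', '采购', '仿制药']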

jieba segmentation for the training set

def train_jieba(data, i):
    # Training text = segmented title + content, with the segmented ann_labels appended
    data_result = jieba_data(data, i)
    result = jieba_label(data_result, data, i)
    return result

jieba segmentation for the test set

def test_jieba(data, i):
    # Test text = segmented title + content only (no labels; those are what we predict)
    data = jieba_data(data, i)
    return data

Reading the data

if __name__ == '__main__':
    train_data_x = read_data('df_news_train_ecnu.csv')
    test_data_x = read_data('df_news_test_ecnu.csv')
    # np.loadtxt reads the label files into numpy matrices/vectors
    y_train = np.loadtxt("y_train_ecnu.csv", delimiter=",", encoding='utf-8')
    y_test = np.loadtxt("y_test_ecnu.csv", delimiter=",", encoding='utf-8')
    print(train_data_x.shape)
    print(test_data_x.shape)

Output: (2582, 5)
        (1107, 5)

Extracting the label values

unimportant_idx = 44  # column of the label matrix that flags "unimportant" news
train_label_y = y_train[:, unimportant_idx]
test_label_y = y_test[:, unimportant_idx]
print("# unimportant / total news in training data: ", int(sum(y_train[:, unimportant_idx])), "/", len(y_train[:, unimportant_idx]))

Segmenting and saving the segmentation results

train_data_clean_x = tokenize_df_content(train_data_x)
test_data_clean_x = tokenize_df_content(test_data_x)
print(train_data_x.shape)
print(test_data_x.shape)
print(train_data_clean_x.shape)
print(test_data_clean_x.shape)  # columns are now content_tokens, title_tokens, label_tokens

train_x = []  # training corpus
test_x = []   # test corpus
for i in range(len(train_data_clean_x)):
    train_x.append(train_jieba(train_data_clean_x, i))
for i in range(len(test_data_clean_x)):
    test_x.append(test_jieba(test_data_clean_x, i))

train_x = np.array(train_x)
test_x = np.array(test_x)
np.savetxt("tmp/X_train.csv", train_x, delimiter=",", fmt="%s", encoding="utf-8")
np.savetxt("tmp/X_test.csv", test_x, delimiter=",", fmt="%s", encoding="utf-8")

Output:
    (2582, 5)
    (1107, 5)
    (2582, 3)
    (1107, 3)

TF-IDF: building the weight matrix

vectorizer = CountVectorizer()    # counts how often each word appears in each text
transformer = TfidfTransformer()  # converts the counts into tf-idf values
# Extract the tf-idf matrix: element a[i][j] is the tf-idf weight of word j in document i
train_tfidf = transformer.fit_transform(vectorizer.fit_transform(train_x))
print(train_tfidf.shape)
# fit only on the training data; plain transform guarantees train and test are processed the same way
test_tfidf = transformer.transform(vectorizer.transform(test_x))
# test_weight = test_tfidf.toarray()
# print(test_weight.shape)

Output: (2582, 72998)
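
With default settings, the two-step CountVectorizer + TfidfTransformer pipeline above is equivalent to scikit-learn's TfidfVectorizer, which does both in one object; a drop-in sketch:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()                       # counts + tf-idf in one step
train_tfidf_alt = tfidf_vec.fit_transform(train_x)  # learn vocabulary and idf on the training corpus
test_tfidf_alt = tfidf_vec.transform(test_x)        # reuse them unchanged on the test corpus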

TF-IDF: printing the feature words

weight = train_tfidf.toarray()
word = vectorizer.get_feature_names()  # all words in the bag-of-words vocabulary (get_feature_names_out in newer scikit-learn)
# Print each document's tf-idf weights: the outer loop walks documents, the inner loop the vocabulary
for i in range(5):
    print("************************ tf-idf weights for article", i, "**********************")
    for j in range(len(word)):
        if weight[i][j] > 0.1:
            print(word[j], weight[i][j])

Output:
    ************************ tf-idf weights for article 0 **********************
    一致性 0.30716252706000763
    仿制 0.21851023802630334
    医疗机构 0.27535485762167283
    原研 0.2036921591200047
    ......
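
Rather than the fixed 0.1 threshold, the top-weighted words per document can be pulled with np.argsort; a small sketch (k = 5 is an arbitrary choice):

top_k = 5
for i in range(3):
    top_idx = np.argsort(weight[i])[::-1][:top_k]  # indices of the largest tf-idf weights in document i
    print("article", i, [(word[j], round(weight[i][j], 3)) for j in top_idx])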

Binary classification ("unimportant news")

svmmodel = SVC(C=1, kernel="linear")  # rbf and poly kernels both did worse than linear here

nn = svmmodel.fit(train_tfidf, train_label_y)
print(nn)
pre_test_label = svmmodel.predict(test_tfidf)
test_data_x["category"] = pre_test_label
np.savetxt("tmp/test_data_label.csv", test_data_x, delimiter=",", fmt="%s", encoding="utf-8")

Evaluation metrics

from sklearn.metrics import classification_report
print(classification_report(test_label_y, pre_test_label))

Output:
             precision    recall  f1-score   support

        0.0       0.76      0.92      0.83       783
        1.0       0.59      0.29      0.39       324

avg / total       0.71      0.73      0.70      1107
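
If you need the numbers as arrays rather than a printed report, the precision_recall_fscore_support imported at the top returns per-class precision, recall, F1 and support directly:

p, r, f1, support = precision_recall_fscore_support(test_label_y, pre_test_label)
print("unimportant-news class: precision=%.2f recall=%.2f f1=%.2f" % (p[1], r[1], f1[1]))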