import random
import re

import jieba
import pandas as pd
def trim(text):
    """Tokenize *text* after scrubbing social-media markup.

    Cleaning removes {%...%} template tags, @mentions, 【...】 headers and
    ASCII alphanumerics, and protects emoticon codes such as "[doge]" with
    a placeholder so the segmenter cannot split them apart. Can be
    overridden to suit other preprocessing needs.

    :param text: raw review string
    :return: list of cleaned tokens (emoticon codes kept intact)
    """
    # Raw strings: "\{" / "\[" are invalid escape sequences in non-raw
    # string literals and warn on modern Python.
    text = re.sub(r"\{%.+?%\}", " ", text)    # {%...%} template tags
    text = re.sub(r"@.+?( |:)", " ", text)    # @mentions up to space/colon
    text = re.sub(r"【.+?】", " ", text)       # bracketed headers
    text = re.sub(r"[a-zA-Z0-9]", " ", text)  # ASCII letters and digits
    # Capture emoticon codes, then mask them so jieba sees one unit each.
    icons = re.findall(r"\[.+?\]", text)
    text = re.sub(r"\[.+?\]", "IconMark", text)

    tokens = []
    # NOTE(review): reloading the user dict on every call is wasteful —
    # consider hoisting to module import time.
    jieba.load_userdict('./data/user_dict.txt')
    for w in jieba.cut(text):
        w = w.strip()
        if "IconMark" in w:
            # Restore the captured emoticons in their original order.
            for _ in range(w.count("IconMark")):
                tokens.append(icons.pop(0))
        elif w and w != '\u200b' and w.isalpha():
            tokens.append(w)
    return tokens
def load_corpus(csvFilePath, stopwordPath):
    """Load the corpus CSV, tokenize each review and drop stopwords.

    :param csvFilePath: path to a CSV with 'label' and 'review' columns
    :param stopwordPath: path to a newline-separated stopword file
    :return: (labels, reviews) where reviews is a list of token lists
    """
    df = pd.read_csv(csvFilePath)
    # A set gives O(1) membership tests instead of an O(n) list scan
    # for every single token.
    stopword = set(load_stopword(stopwordPath))
    labels = df['label'].to_list()
    reviews = df['review'].to_list()
    trimedReviews = [
        [word for word in trim(review) if word not in stopword]
        for review in reviews
    ]
    return labels, trimedReviews
def load_reviews(csvFilePath):
    """Read the corpus CSV and return its 'label' and 'review' columns.

    :param csvFilePath: path to a CSV with 'label'/'review' columns
    :return: (labels, reviews) as pandas Series
    """
    frame = pd.read_csv(csvFilePath)
    labels = frame['label']
    reviews = frame['review']
    return labels, reviews
def load_stopword(filePath):
    """Load stopwords from *filePath*.

    :param filePath: UTF-8 text file, one stopword per line
    :return: list of stripped stopword strings
    """
    with open(filePath, encoding='UTF-8') as handle:
        content = handle.read()
    return [entry.strip() for entry in content.splitlines()]
def data_suffle(labels, reviews):
    """Shuffle labels and reviews in unison, preserving their pairing.

    :param labels: sequence of labels
    :param reviews: sequence of reviews, parallel to *labels*
    :return: (labels, reviews) as freshly shuffled lists
    """
    paired = list(zip(labels, reviews))
    random.shuffle(paired)
    shuffledLabels, shuffledReviews = zip(*paired)
    return list(shuffledLabels), list(shuffledReviews)
def pre_trim(csvFilePath, stopwordPath):
    """Preprocess the CSV corpus and persist the cleaned version.

    Writes '<input-stem>Trimed.csv' next to the input, with each review
    replaced by its space-joined cleaned tokens.

    :param csvFilePath: path to the raw corpus CSV (must end in '.csv')
    :param stopwordPath: path to the stopword file
    """
    # NOTE(review): load_corpus re-reads the same CSV — acceptable for a
    # one-shot preprocessing script, but could be refactored away.
    df = pd.read_csv(csvFilePath)
    _, reviews = load_corpus(csvFilePath, stopwordPath)
    # Join each token list back into a space-separated string.
    df['review'] = [' '.join(tokens) for tokens in reviews]
    df.to_csv(csvFilePath[:-4] + 'Trimed.csv', index=False)
if __name__ == '__main__':
    # Script entry point: clean the 100k corpus and persist the result.
    corpusPath = '../../corpus/100k/all.csv'
    stopwordFile = './data/stopword.txt'
    pre_trim(corpusPath, stopwordFile)