Building a simple Dataset for an NLP application (word frequency is not considered here; you could also add a dictionary to record word frequencies)
The corpus file to process, corpus_english.txt, looks roughly like this:
The Rock is destined to be the 21st Century Segal . ||| 3
The gorgeously elaborate continuation of `` The Lord of the Rings '' trilogy Middle-earth . ||| 4
Singer/composer Bryan Adams contributes a slew of songs -- a few potential hits , er , spirit of the piece . ||| 3
You 'd think by now America would have had enough of plucky British eccentrics with hearts of gold . ||| 2
Yet the act is still charming here . ||| 3
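Each line therefore holds a sentence and its integer label separated by |||, so a single line can be parsed with an ordinary string split. A minimal sketch (the sample line is taken from the excerpt above):

line = "Yet the act is still charming here . ||| 3"
text, label = line.split("|||")    # the separator yields exactly two fields
sentence = text.strip()            # "Yet the act is still charming here ."
label = int(label.strip())         # 3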
Goals:
1. Split each line of the file into a sentence plus a label.
2. Number every word that appears, so that each word maps to a unique integer index.
3. Replace every word in a sentence with its index; the sentence is then represented by a sequence of numbers, i.e. a vector (a short walkthrough follows this list).
4. The data now has the shape vector : label.
5. Write the Dataset class.
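To make steps 2 through 4 concrete, here is a minimal walkthrough of how one parsed sentence becomes a fixed-length index vector. It follows the same conventions as the full code below (index 0 reserved for PAD, punctuation filtered out); the values are illustrative:

word2index = {'PAD': 0}                       # index 0 is reserved for padding
sentence = "Yet the act is still charming here ."
tokens = [w for w in sentence.split(" ") if w not in [',', '.']]
vector = []
for word in tokens:
    if word not in word2index:                # first occurrence: assign the next free index
        word2index[word] = len(word2index)
    vector.append(word2index[word])
max_sentence_length = 10
# pad to at least max_sentence_length, then truncate to exactly that length
vector = (vector + [word2index['PAD']] * max_sentence_length)[:max_sentence_length]
print(vector)                                 # [1, 2, 3, 4, 5, 6, 7, 0, 0, 0]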
Data can be understood as a table in which each row is one sentence_vector (a fixed-length list of word indices).
The code is as follows:

import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

word2index = {}  # word-to-index mapping
index2word = {}  # index-to-word mapping
word2index.update({'PAD': 0})
index2word.update({0: 'PAD'})

def sentence_embedding(corpus_path, max_sentence_length):
    '''
    Parameters
    ----------
    corpus_path: path to the corpus file
    max_sentence_length: maximum sentence length

    Returns: sentence vectors, labels
    -------
    '''
    # stop characters stripped during tokenization (the list can be customized)
    stop_word = [',', '.', '(', ')', ';', '-', '!', '@', '#', '$', '%', '^',
                 '&', '*', '[', ']', '{', '}', ':', '/', '|', '\\']
    sentence_vector = []  # sentence vectors
    label_vector = []     # labels
    index = 1             # index 0 is reserved for 'PAD'
    with open(corpus_path, 'r') as file:
        for line in file.readlines():
            sentence_list = []  # vector of the current sentence; reset on every iteration
            line_list = line.split("|||")  # separate text from label
            sentence, label = line_list[0].strip(), line_list[1].strip()
            # tokenize, dropping the stop characters
            sentence_words = [word.strip() for word in sentence.split(" ") if word not in stop_word]
            for word in sentence_words:
                if word2index.get(word) is None:  # unseen word: assign the next index
                    word2index[word] = index
                    index2word[index] = word
                    index += 1
                sentence_list.append(word2index[word])
            # every sentence is capped at max_sentence_length, so its vector is too
            sentence_list.extend([word2index['PAD']] * max_sentence_length)  # pad
            sentence_list = sentence_list[:max_sentence_length]              # truncate
            sentence_vector.append(sentence_list)
            label_vector.append(int(label))
    return np.array(sentence_vector), np.array(label_vector)

class dataset(Dataset):
    def __init__(self, corpus_path, sentence_max_length):
        super(dataset, self).__init__()
        self.Data, self.Label = sentence_embedding(corpus_path, sentence_max_length)

    def __getitem__(self, index):
        Data, Label = torch.tensor(self.Data[index]), torch.tensor(self.Label[index])
        return Data, Label

    def __len__(self):
        return len(self.Label)

if __name__ == '__main__':
    # adjust this to the local path of corpus_english.txt
    path = r'F:pythonMarkDown_code_notebooklearnNLP_Implement_datasetdataset_englishcorpus_english.txt'
    sentence_length = 10
    dataset = dataset(corpus_path=path, sentence_max_length=sentence_length)
    dataloader = DataLoader(dataset=dataset, batch_size=5, shuffle=True)
    for data, label in dataloader:
        print("data", data)
        print("label", label)
        break
The output is:
data tensor([[2298,   10,  697, 2299,  118,  123,    0,    0,    0,    0],
        [ 170,  786, 1962,  157, 1963,   15, 1670, 1964,   15,   20],
        [ 170,  623,   92,  110,  890,   62, 1021,    3,    5, 1468],
        [ 813, 1609, 1610, 1461, 2131,  956,  248,    7, 1465,    5],
        [   1, 4140,   36,    1, 4141, 4142,  248,   63, 4143,  123]],
       dtype=torch.int32)
label tensor([4, 3, 3, 3, 2], dtype=torch.int32)
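The title notes that a dictionary recording word frequency could also be added; the code above does not build one, but a minimal sketch using collections.Counter might look like this (word2count is a hypothetical name, not part of the code above):

from collections import Counter

word2count = Counter()  # hypothetical word -> occurrence-count mapping
# update it with each tokenized sentence as sentence_embedding reads the file;
# here two toy sentences stand in for the per-line loop:
for sentence_words in [["the", "act", "is", "charming"], ["the", "rock"]]:
    word2count.update(sentence_words)
print(word2count["the"])  # 2

One common use of such counts is to drop rare words (say, count < 2) before assigning indices, which keeps the vocabulary small.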