import os

import chardet
import jieba

# Root folder of the training corpus (one sub-folder per category) and its
# parent folder, where the merged output is written.
path = "D:/python/复旦中文文本训练集/train/"
path1 = "D:/python/复旦中文文本训练集/"


def check(path: str):
    """Print the character encoding that chardet detects for one file.

    Args:
        path: Location of the file to inspect. Inside this function the
            parameter shadows the module-level ``path`` constant.
    """
    # Read raw bytes: detection has to happen before any decoding.
    with open(path, 'rb') as f:
        # Label the result with the inspected file. The previous version
        # printed ``i``, a name that is not defined in this function.
        print(chardet.detect(f.read())['encoding'], ': ', path)
运行后发现,检测到的编码都是 GB2312。
但是按 GB2312 用 open 读取文件时就出问题了。
# Collect the path of every file inside each sub-folder of the corpus.
filelist = []
for i in os.listdir(path):
    category_dir = path + i
    filelist.extend(category_dir + "/" + j for j in os.listdir(category_dir))

# Concatenate the text of all files into one string, decoding as GB2312.
# NOTE: `str` shadows the builtin type; the name is kept as the script uses it.
str = ""
for file in filelist:
    with open(file, encoding='GB2312') as f:
        str += f.read()
报错:'gb2312' codec can't decode byte 0xaa in position 134: illegal multibyte sequence
网上一搜,说 GB2312 遇到繁体字会报错,于是改成覆盖范围更广的编码 gb18030(它是 GB2312 的超集)。大部分文件都能正常读取了,但仍有少数文件报错。
祭出绝招 errors='ignore'(忽略无法解码的字节,这些字节会被直接丢弃),搞定。
# Read every corpus file as gb18030 and merge the text into one string.
parts = []
for file in filelist:
    # errors='ignore' silently drops any bytes that still cannot be decoded,
    # so a few characters may be missing from the merged text.
    with open(file, encoding='gb18030', errors='ignore') as f:
        parts.append(f.read())
# Join once instead of growing the string with += on every file.
# NOTE: `str` shadows the builtin type; the name is kept as the script uses it.
str = "".join(parts)

# Write the merged corpus as UTF-8; the context manager closes the file even
# if the write fails.
with open(path1 + "train.txt", 'w', encoding='utf-8') as fo:
    fo.write(str)
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)