Build a simple retrieval-based question-answering system.
The project covers:
- String operations
- Text preprocessing (word filtering, normalization)
- Text representation (bag-of-words, tf-idf, word2vec)
- Text similarity computation
- Efficient text retrieval
Data:
- dev-v2.0.json: question-answer pairs, extracted with a small parser (the code below actually reads train-v2.0.json)
- glove.6B: https://nlp.stanford.edu/projects/glove/, using the d=100 word vectors
Main library: sklearn, http://scikit-learn.org/stable/install.html
Tokenizer: jieba, https://github.com/fxsjy/jieba (listed for Chinese text; the English questions below are tokenized with NLTK)
import json
def read_corpus():
    """
    qlist = ["question 1", "question 2", "question 3", ...]
    alist = ["answer 1", "answer 2", "answer 3", ...]
    """
    qlist = []
    alist = []
    with open("train-v2.0.json") as file:
        data = json.load(file)
        # print(data)
        for item in data['data']:
            for para in item['paragraphs']:
                for qa in para['qas']:
                    qlist.append(qa['question'])
                    # some entries have an empty answers list, which raises IndexError
                    try:
                        alist.append(qa['answers'][0]['text'])
                    except IndexError:
                        qlist.pop()
    assert len(qlist) == len(alist)  # make sure the two lists stay aligned
    return qlist, alist
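A quick sanity check of the parsed corpus (a sketch; the length matches the count printed in the next subsection):
qlist, alist = read_corpus()
print(len(qlist), len(alist))    # 86821 question-answer pairs for train-v2.0.json
print(qlist[0], '->', alist[0])  # first question and its answer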
2.2 Exploring the data (visualization / summary statistics)
# Count word frequencies
from collections import Counter
qlist, alist = read_corpus()
word_dict = Counter()
for text in qlist:
    word_dict.update(text.strip(' .!?').split(' '))
word_total = len(word_dict)
print("There are %d distinct words" % word_total)
print(sum(word_dict.values()))
There are 51841 distinct words
874076
print(len(qlist))
86821
# Sort the word frequencies and visualize the distribution
import matplotlib.pyplot as plt
y = sorted(word_dict.values(), reverse=True)
plt.subplot(221)
plt.plot(y)
plt.subplot(222)
plt.plot(y[:2000])
plt.subplot(223)
plt.plot(y[:200])
plt.subplot(224)
plt.plot(y[:20])
plt.show()
# Top-10 most frequent words in the questions
print("There are %d distinct words" % word_total)
print([[w, c] for w, c in word_dict.most_common(10)])
print(sum(word_dict.values()))
# The same analysis for the answers (alist), kept here for reference:
# counter = Counter()
# for text in alist:
#     counter.update(text.strip(' .!?').split(' '))
# print([[w, c] for w, c in counter.most_common(10)])
There are 51841 distinct words
[['the', 60960], ['What', 36995], ['of', 33987], ['in', 21785], ['to', 18443], ['was', 17065], ['is', 16198], ['did', 15634], ['what', 13219], ['a', 10753]]
874076
2.3 Text preprocessing
# English preprocessing:
# stop words, lowercasing, punctuation, low-frequency words, numbers (mapped to "#number"), stemming, ...
# requires the NLTK data packages: nltk.download('stopwords'), nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import math

sw = set(stopwords.words('english'))
# keep question words, they carry meaning for QA
sw -= {'who', 'when', 'why', 'where', 'how'}
# drop common tokenization artifacts
sw.update(['\'s', '``', '\'\''])
ps = PorterStemmer()

def text_processing(text):
    seg = []
    for word in word_tokenize(text):
        word = ps.stem(word.lower())
        word = "#number" if word.isdigit() else word
        if len(word) > 1 and word not in sw:
            seg.append(word)
    return seg
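A quick check of the preprocessing (the expected output matches the first processed question printed further below):
print(text_processing("When did Beyonce start becoming popular?"))
# ['when', 'beyonc', 'start', 'becom', 'popular']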
counter_qlist = Counter()
list_q = []
for text in qlist:
    seg = text_processing(text)
    list_q.append(seg)
    counter_qlist.update(seg)

# frequency threshold derived from Zipf's law (see the note below)
sorted_value = sorted(counter_qlist.values(), reverse=True)
min_tf = sorted_value[int(math.exp(0.99 * math.log(len(counter_qlist))))]

# drop low-frequency words
for i in range(len(list_q)):
    list_q[i] = [word for word in list_q[i] if counter_qlist[word] > min_tf]
qlist = list_q  # qlist is now the preprocessed, tokenized question list
len(qlist)
86821
print(len(counter_qlist))
28170
print(sum(counter_qlist.values()))
530774
print(min_tf)
1
print(qlist[:10])
[['when', 'beyonc', 'start', 'becom', 'popular'], ['area', 'beyonc', 'compet', 'when', 'wa', 'grow'], ['when', 'beyonc', 'leav', 'destini', 'child', 'becom', 'solo', 'singer'], ['citi', 'state', 'beyonc', 'grow'], ['decad', 'beyonc', 'becom', 'famou'], ['group', 'wa', 'lead', 'singer'], ['album', 'made', 'worldwid', 'known', 'artist'], ['who', 'manag', 'destini', 'child', 'group'], ['when', 'beyoncé', 'rise', 'fame'], ['role', 'beyoncé', 'destini', 'child']]
Zipf’s law: assume the word frequencies follow this law, i.e. the frequency of the word at rank r is roughly proportional to 1/r. Integrating 1/r gives ln(r), so the cumulative frequency mass up to rank x is about ln(x) out of ln(n) in total, where n is the number of word types. Covering 99% of the text therefore requires ln(x) ≥ 0.99·ln(n), i.e. the cut-off rank is x = e^(0.99·ln(n)) = n^0.99; words ranked beyond that point are treated as low-frequency and dropped.
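A small sketch of where that cut-off lands for the vocabulary above (n is the 28170 word types counted after preprocessing; the count at that rank is 1, which is why min_tf printed as 1):
n = len(counter_qlist)                        # 28170 word types
cut_rank = int(math.exp(0.99 * math.log(n)))  # = int(n ** 0.99)
print(cut_rank, sorted_value[cut_rank])       # a rank close to n, where the count has already dropped to 1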
2.4 Text representation: convert the text into vectors.
# tf-idf vectors
# X: N x D, where N is the number of questions (samples)
# and D is the size of the vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([' '.join(seg) for seg in qlist])

# sparsity of the matrix
def sparsity_ratio(X):
    return 1.0 - X.nnz / float(X.shape[0] * X.shape[1])

print(X.shape)
print("input sparsity ratio:", sparsity_ratio(X))  # print the sparsity
(86821, 14547)
input sparsity ratio: 0.9995936879973627
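This sparsity is what makes the representation practical; a rough memory comparison, as a sketch (dense float64 storage vs. the CSR arrays scipy actually keeps):
dense_bytes = X.shape[0] * X.shape[1] * 8                           # ~10 GB if stored densely
sparse_bytes = X.data.nbytes + X.indices.nbytes + X.indptr.nbytes   # a few MB in CSR form
print("dense: %.1f GB, sparse: %.1f MB" % (dense_bytes / 1e9, sparse_bytes / 1e6))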
2.5 For a user question, find the 5 most similar questions in the corpus and return their 5 answers as candidates.
from queue import PriorityQueue

def top5results(input_q):
    """
    Given the user question input_q, return the answers of the 5 most similar questions:
    1. Preprocess input_q and convert it into a tf-idf vector (reusing the vectorizer above).
    2. Compute the similarity against every question in the corpus
       (rows of X are L2-normalized by TfidfVectorizer, so the dot product is cosine similarity).
    3. Return the answers of the top-5 most similar questions.
    """
    q_vector = vectorizer.transform([' '.join(text_processing(input_q))])
    print(q_vector.shape)
    sim = (X * q_vector.T).toarray()
    # keep the 5 largest similarities with a size-5 min-heap
    pq = PriorityQueue()
    for cur in range(sim.shape[0]):
        pq.put((sim[cur][0], cur))
        if len(pq.queue) > 5:
            pq.get()  # drop the current smallest
    pq_rank = sorted(pq.queue, reverse=True, key=lambda x: x[0])
    top_idxs = [x[1] for x in pq_rank]   # indices of the most similar questions in qlist
    return [alist[i] for i in top_idxs]  # their answers, as the TOP 5 candidates
ts = [' '.join(text_processing("Which airport was shut down?"))]
print(ts)
['airport wa shut']
# Manual tests
print(top5results("Which airport was shut down?"))  # this question exists in the corpus; the first returned answer is correct
print(top5results("Which airport is closed?"))
print(top5results("What government blocked aid after Cyclone Nargis?"))  # this question exists in the corpus; the first returned answer is correct
print(top5results("Which government stopped aid after Hurricane Nargis?"))
(1, 14547)
['Chengdu Shuangliu International Airport', 'Chengdu Shuangliu International Airport', 'aerodrome with facilities for flights to take off and land', 'newspapers', 'various gaming sites']
(1, 14547)
['Plymouth City Airport', 'aerodrome with facilities for flights to take off and land', 'related', 'After the reunification', 'Nanjing Dajiaochang Airport']
(1, 14547)
['Myanmar', 'foreign aid', '10 days', 'the British government', 'The latent heat of water condensation amplifies convection']
(1, 14547)
['Myanmar', 'Isabel', 'foreign aid', 'Soviet Union and China', '10 days']
# Time and space complexity:
Time complexity = O(N), space complexity = O(N), where N is the number of questions: every query is compared against the entire corpus.
2.6 Optimization with an inverted index.
Build an inverted index to speed up retrieval: for each word, store the set of question indices that contain it, so a query only needs to be compared against questions that share at least one word with it.
from collections import defaultdict

# build the inverted index: word -> set of question indices
inverted_idx = defaultdict(set)
for cur in range(len(qlist)):
    for word in qlist[cur]:
        inverted_idx[word].add(cur)
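A quick peek at one posting list (a sketch; 'airport' is the stemmed token form used in the preprocessed questions):
print(len(inverted_idx['airport']))         # number of questions that contain "airport"
print(sorted(inverted_idx['airport'])[:5])  # a few of their indices into qlist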
# Keyword-filtered retrieval
def top5results_invidx(input_q):
    """
    Given the user question input_q, return the answers of the 5 most similar questions:
    1. Use the inverted index to select candidate questions.
    2. Preprocess input_q and convert it into a tf-idf vector (reusing the vectorizer above).
    3. Compute the similarity against each candidate.
    4. Return the answers of the top-5 most similar questions.
    """
    input_seg = text_processing(input_q)
    # union of all questions containing at least one query word
    candidates = set()
    for word in input_seg:
        candidates = candidates | inverted_idx[word]
    candidates = list(candidates)
    q_vector = vectorizer.transform([' '.join(input_seg)])
    sim = (X[candidates] * q_vector.T).toarray()
    # keep the 5 largest similarities with a size-5 min-heap
    pq = PriorityQueue()
    for cur in range(sim.shape[0]):
        pq.put((sim[cur][0], candidates[cur]))
        if len(pq.queue) > 5:
            pq.get()
    pq_rank = sorted(pq.queue, reverse=True, key=lambda x: x[0])
    print([x[0] for x in pq_rank])
    top_idxs = [x[1] for x in pq_rank]   # indices (into qlist) of the most similar questions
    return [alist[i] for i in top_idxs]  # their answers, as the TOP 5 candidates
# Manual tests
print(top5results_invidx("Which airport was shut down?"))  # this question exists in the corpus; the first returned answer is correct
print(top5results_invidx("Which airport is closed?"))
print(top5results_invidx("What government blocked aid after Cyclone Nargis?"))  # this question exists in the corpus; the first returned answer is correct
print(top5results_invidx("Which government stopped aid after Hurricane Nargis?"))
[1.0, 1.0, 0.6058098700501424, 0.5955903794475756, 0.5604486086527194]
['Chengdu Shuangliu International Airport', 'Chengdu Shuangliu International Airport', 'aerodrome with facilities for flights to take off and land', 'newspapers', 'various gaming sites']
[0.7797154765957257, 0.7103241311289762, 0.7038747251719334, 0.6245909883904857, 0.5811739588019266]
['Plymouth City Airport', 'aerodrome with facilities for flights to take off and land', 'related', 'After the reunification', 'Nanjing Dajiaochang Airport']
[0.9999999999999998, 0.7852110277213404, 0.49331031138548853, 0.4162177525363464, 0.33229596712940707]
['Myanmar', 'foreign aid', '10 days', 'the British government', 'The latent heat of water condensation amplifies convection']
[0.5947629389746683, 0.39360612204759465, 0.35791876809775003, 0.31237667954304615, 0.29996990837431825]
['Myanmar', 'Isabel', 'foreign aid', 'Soviet Union and China', '10 days']
# Time and space complexity:
With the inverted index, each query is only compared against its candidate set rather than all N questions, so the work per query is roughly proportional to the number of candidates; in the worst case (very common query words) it still degrades to O(N). The index itself adds space proportional to the total number of tokens.
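A rough way to see the saving is to compare the candidate set size with the full corpus for a concrete query (a sketch; the exact count depends on the query words):
seg = text_processing("Which airport was shut down?")
cand = set()
for w in seg:
    cand |= inverted_idx[w]
print(len(cand), "candidates out of", len(qlist), "questions")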
2.7 Text representation based on word embeddings
GloVe word vectors: https://nlp.stanford.edu/projects/glove/ (glove.6B.zip), using the d=100 vectors.
import numpy as np
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# convert the GloVe file to word2vec format once, then load it
# _ = glove2word2vec('glove.6B.100d.txt', 'glove2word2vec.6B.100d.txt')
model = KeyedVectors.load_word2vec_format('glove2word2vec.6B.100d.txt')

def docvec_get(seg):
    """
    Turn a tokenized question into a sentence vector
    by averaging the word vectors of its in-vocabulary tokens.
    seg: list of tokens
    return: 1 x 100 sentence vector
    """
    vector = np.zeros((1, 100))
    size = len(seg)
    for word in seg:
        try:
            vector += model.wv[word]  # use model[word] instead under gensim >= 4.0
        except KeyError:
            size -= 1                 # skip out-of-vocabulary words
    return vector / size

X = np.zeros((len(qlist), 100))
for cur in range(X.shape[0]):
    X[cur] = docvec_get(qlist[cur])

# L2-normalize every row of X so that a dot product equals cosine similarity
Xnorm2 = np.linalg.norm(X, axis=1, keepdims=True)
X = X / Xnorm2
D:\soft\anaconda\lib\site-packages\ipykernel_launcher.py:16: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).
app.launch_new_instance()
D:\soft\anaconda\lib\site-packages\ipykernel_launcher.py:20: RuntimeWarning: invalid value encountered in true_divide
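The RuntimeWarning above most likely comes from questions whose tokens are all missing from the GloVe vocabulary: size ends up 0 in docvec_get, the division yields NaN, and the NaN rows survive the normalization. A minimal guard, as a sketch:
bad = np.isnan(X).any(axis=1)   # rows that came out as NaN
print(bad.sum(), "questions have no in-vocabulary words")
X[bad] = 0.0                    # zero them out so they can never rank highly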
# The word-vector matrix is D x H, where D is the vocabulary size and H is the embedding size.
# Out-of-vocabulary words are ignored; a sentence is represented by the average of its word vectors.
def top5results_emb(input_q):
    """
    Given the user question input_q, return the answers of the 5 most similar questions:
    1. Use the inverted index to select candidate questions.
    2. Convert input_q into a sentence vector.
    3. Compute the similarity against each candidate.
    4. Return the answers of the top-5 most similar questions.
    """
    # preprocessing
    seg = text_processing(input_q)
    # use the inverted index built above
    candidates = set()
    for word in seg:
        # union of all questions containing at least one query word
        candidates = candidates | inverted_idx[word]
    candidates = list(candidates)
    q_vector = docvec_get(seg)
    # L2-normalize the query vector
    qnorm2 = np.linalg.norm(q_vector, axis=1, keepdims=True)
    q_vector = q_vector / qnorm2
    # cosine similarity: both sides are already L2-normalized, so a dot product suffices
    sim = X[candidates] @ q_vector.T
    # keep the 5 largest similarities with a size-5 min-heap
    pq = PriorityQueue()
    for cur in range(sim.shape[0]):
        pq.put((sim[cur][0], candidates[cur]))
        if len(pq.queue) > 5:
            pq.get()
    pq_rank = sorted(pq.queue, reverse=True, key=lambda x: x[0])
    print([x[0] for x in pq_rank])
    top_idxs = [x[1] for x in pq_rank]   # indices (into qlist) of the most similar questions
    return [alist[i] for i in top_idxs]  # their answers, as the TOP 5 candidates
# Manual tests
print(top5results_emb("Which airport was shut down?"))  # this question exists in the corpus; the first returned answer is correct
print(top5results_emb("Which airport is closed?"))
print(top5results_emb("What government blocked aid after Cyclone Nargis?"))  # this question exists in the corpus; the first returned answer is correct
print(top5results_emb("Which government stopped aid after Hurricane Nargis?"))
D:\soft\anaconda\lib\site-packages\ipykernel_launcher.py:16: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).
app.launch_new_instance()
[1.0, 1.0, 0.8875869259222657, 0.8826214612899685, 0.83558088872733]
['Chengdu Shuangliu International Airport', 'Chengdu Shuangliu International Airport', 'Terminal C', 'Nanjing Dajiaochang Airport', '1967']
[0.9454294862808652, 0.9029611996952854, 0.9029611996952854, 0.9029611996952854, 0.8917413888585661]
['Plymouth City Airport', 'southern suburbs of Paris', 'within the departure areas', 'India', 'Dushanbe International Airport']
[1.0, 0.852360897734157, 0.8518187365307015, 0.8508247887568897, 0.8409244964740952]
['Myanmar', 'most Protestants (and most Jews)', 'lower house of parliament', 'the Tzu Chi Foundation', 'started an anti-separatist campaign']
[0.8828545495470352, 0.8348415264745357, 0.8166760602126991, 0.810772868269737, 0.7993383778232652]
['Myanmar', 'the Tzu Chi Foundation', 'started an anti-separatist campaign', 'public gaze', 'most Protestants (and most Jews)']
3. Spelling correction
Goal: automatically fix misspelled words in the user input.
Method: the noisy channel model. Formula:
c^* = \text{argmax}_{c \in candidates}\; p(c \mid s) = \text{argmax}_{c \in candidates}\; p(s \mid c)\, p(c)
Here candidates is the candidate set for the misspelled word, which we assume is generated via edit distance (e.g. all valid words within edit distance 1 or 2 of the current word, where "valid" means the word exists in the dictionary). c denotes the correct word and s the word the user actually typed. The goal is to find the candidate c that maximizes the probability above.
p(s|c): this probability can in principle be estimated from historical data, i.e. for a correct word c, what fraction of people misspell it as form 1, form 2, ... The raw data comes from spell-errors.txt, but the file does not record these proportions, so we use a uniform probability over the listed misspellings. This term is also called the channel probability.
p(c): this term is the language model, i.e. if we replace the misspelled s with c, how fluent is the resulting sentence? In this project we use a bigram model to estimate it. For example, given two candidates c_1 and c_2, we want the language-model probability of each; since the model is a bigram, we need two probabilities per candidate, one for the bigram with the preceding word and one for the bigram with the following word. Concretely:
Given "We are go to school tomorrow", we want to replace the middle word "go" with its correct form. Suppose the candidate set contains two words, "going" and "went". For each candidate we compute
p(going|are) * p(to|going)   and   p(went|are) * p(to|went),
use these as the p(c) term, and then combine each with the channel probability to get the final score of the candidate.
How are bigram probabilities such as p(going|are) computed? By training a language model. Training one needs text data; here we use the Reuters news corpus that ships with nltk (with more resources you could of course use a larger corpus). The end goal is simply to estimate the bigram probabilities.
Train a bigram language model on the nltk reuters corpus, using add-one smoothing.
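With add-one (Laplace) smoothing, the estimate implemented below is, in the notation of the code:
p(w2 | w1) = (count_bi[w1, w2] + 1) / (count_uni[w1] + vocab_size)
The two branches in the smoothing loop further down are simply this formula written out separately for the zero-count and non-zero-count cases.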
import nltk
nltk.download('reuters')
from nltk.corpus import reuters
import numpy as np

# load the corpus
categories = reuters.categories()
corpus = reuters.sents(categories=categories)
# print(corpus[0])

# Walk over the corpus and build the bigram probabilities.
# bigram[word1][word2]: probability that word2 follows word1.
new_corpus = []
for sent in corpus:
    # pad each sentence with a boundary token marking start and end
    new_corpus.append([' '] + sent + [' '])
print(new_corpus[0])
# build the vocabulary
word2id = dict()
id2word = dict()
for sent in new_corpus:
    for w in sent:
        w = w.lower()
        if w in word2id:
            continue
        id2word[len(word2id)] = w
        word2id[w] = len(word2id)
vocab_size = len(word2id)

# unigram and bigram counts
# note: the dense vocab_size x vocab_size matrices below can use a lot of memory for a large vocabulary
count_uni = np.zeros(vocab_size)
count_bi = np.zeros((vocab_size, vocab_size))
# writeVocab(word2id, "lm_vocab.txt")
for sent in new_corpus:
    for i, w in enumerate(sent):
        w = w.lower()
        count_uni[word2id[w]] += 1
        if i < len(sent) - 1:
            count_bi[word2id[w], word2id[sent[i + 1].lower()]] += 1
print("unigram done")
# bigram LM with add-one smoothing:
# p(w_j | w_i) = (count_bi[i, j] + 1) / (count_uni[i] + |vocab|)
bigram = np.zeros((vocab_size, vocab_size))
for i in range(vocab_size):
    for j in range(vocab_size):
        if count_bi[i, j] == 0:
            bigram[i, j] = 1.0 / (vocab_size + count_uni[i])
        else:
            bigram[i, j] = (1.0 + count_bi[i, j]) / (vocab_size + count_uni[i])
def checkLM(word1, word2):
    # look up p(word2 | word1); return 0.0 for out-of-vocabulary words
    if word1.lower() in word2id and word2.lower() in word2id:
        return bigram[word2id[word1.lower()], word2id[word2.lower()]]
    else:
        return 0.0

print(checkLM('I', 'like'))
3.2 Building the channel probabilities
Build the channel probability from the spell-errors.txt file, where channel[c][s] is the probability that the correct word c is misspelled as s.
channel = {}
# Each line of the file has the format  w1: w2, w3, ...
# where w1 is the correct word and w2, w3, ... are observed misspellings.
# The file gives no frequencies, so we treat the misspellings as equally likely.
for line in open('spell-errors.txt'):
    correct, error = line.strip().split(':')
    errors = error.split(',')
    errorProb = dict()
    for e in errors:
        errorProb[e.strip()] = 1.0 / len(errors)
    channel[correct.strip()] = errorProb
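A quick look at one entry (a sketch; 'raining' is only an illustrative key and assumes such a line exists in spell-errors.txt):
print(channel.get('raining', {}))  # maps each listed misspelling of "raining" to 1/number_of_misspellings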
3.3 Generating the candidate set for a misspelled word
Given a misspelled word, generate all candidate words within edit distance 1 or 2 of it.
def filter(words):
    # keep only words that exist in the vocabulary (note: shadows the built-in filter)
    new_words = []
    for w in words:
        if w in word2id:
            new_words.append(w)
    return set(new_words)

def generate_candidates1(word):
    # generate all strings at edit distance 1:
    # insert, substitute or delete one of the 26 letters, or transpose adjacent letters
    chars = 'abcdefghijklmnopqrstuvwxyz'
    # insert 1
    words = set(word[0:i] + chars[j] + word[i:] for i in range(len(word) + 1) for j in range(len(chars)))
    # substitute 1
    words = words | set(word[0:i] + chars[j] + word[i + 1:] for i in range(len(word)) for j in range(len(chars)))
    # delete 1
    words = words | set(word[0:i] + word[i + 1:] for i in range(len(word)))
    # transpose adjacent letters
    words = words | set(word[0:i - 1] + word[i] + word[i - 1] + word[i + 1:] for i in range(1, len(word)))
    # drop everything that is not in the vocabulary
    words = filter(words)
    # drop the word itself
    if word in words:
        words.remove(word)
    return words
def generate_candidates(word):
    # Starting from the misspelled word, generate candidates at edit distance 1 or 2
    # and keep only those that are valid dictionary words.
    words = generate_candidates1(word)
    words2 = set([])
    for w in words:
        # expand each distance-1 candidate by one more edit to get distance-2 candidates
        # (generate_candidates1 already filters against the vocabulary)
        words2 = words2 | generate_candidates1(w)
    # merge the distance-1 and distance-2 candidates
    return words | words2
words = generate_candidates('strat')
print(words)
3.4 Correcting a user input when it contains errors
The user enters a query and we check it for spelling errors: tokenize the query, look each token up in the vocabulary, and for tokens that are misspelled use the channel probability and the bigram LM to pick the best candidate.
import numpy as np
import queue as Q

def word_corrector(word, context):
    # context = (previous word, next word)
    word = word.lower()
    candidate = generate_candidates(word)
    if len(candidate) == 0:
        return word
    correctors = Q.PriorityQueue()
    for w in candidate:
        if w in channel and word in channel[w] and w in word2id \
                and context[0].lower() in word2id and context[1].lower() in word2id:
            # score = log p(s|c) + log p(c|prev) + log p(next|c)
            probability = np.log(channel[w][word] + 0.0001) \
                + np.log(bigram[word2id[context[0].lower()], word2id[w]]) \
                + np.log(bigram[word2id[w], word2id[context[1].lower()]])
            correctors.put((-1 * probability, w))
    if correctors.empty():
        return word
    return correctors.get()[1]

word = word_corrector('strat', ('to', 'in'))
print(word)
def spell_corrector(line):
    # 1. Tokenize line into tokens.
    # 2. For each token, check whether it exists in the vocabulary. If it does not,
    #    treat it as a spelling error and correct it with the noisy channel model above.
    new_words = []
    # pad with the same boundary token used when training the LM,
    # so out-of-vocabulary tokens always have a left and right context
    words = [' '] + line.strip().lower().split(' ') + [' ']
    for i, word in enumerate(words):
        if i == len(words) - 1:
            break
        word = word.lower()
        if word not in word2id:
            # considered misspelled; thanks to the padding, i is always in [1, len - 2]
            new_words.append(word_corrector(word, (words[i - 1].lower(), words[i + 1].lower())))
        else:
            new_words.append(word)
    newline = ' '.join(new_words[1:])
    return newline  # the corrected line; if the input had no errors, newline == line
sentence = spell_corrector('When did Beyonce strat becoming popular')
print(sentence)
3.5 Automatic correction of user input based on the spelling corrector
Given the user input query, convert the sentence into tokens, check whether each token is valid, and if not, run the correction procedure above before retrieval.
test_query1 = ""  # a query containing spelling errors
test_query2 = ""  # a query containing spelling errors
test_query1 = spell_corrector(test_query1)
test_query2 = spell_corrector(test_query2)
# The get_top_results_* names are not defined in this notebook; the implementations
# here that correspond to them are top5results_invidx (tf-idf) and top5results_emb (word2vec).
print(get_top_results_tfidf(test_query1))
print(get_top_results_w2v(test_query1))
print(get_top_results_bert(test_query1))
print(get_top_results_tfidf(test_query2))
print(get_top_results_w2v(test_query2))
print(get_top_results_bert(test_query2))