Python中Okapi BM25的实现
我正在尝试在Python中实现Okapi BM25算法。虽然我看到了一些教程,但在这个过程中我似乎遇到了困难。
我有一组文档(包含'id'和'text'这两列)和查询(同样包含'id'和'text'这两列)。我已经完成了预处理步骤,现在我的文档和查询都整理成了一个列表:
documents = list(train_docs['text']) # collect the text column of the documents into a list
queries = list(train_queries_all['text']) # collect the text column of the queries into a list
然后为了计算BM25,我这样做:
pip install rank_bm25
#计算BM25
from rank_bm25 import BM25Okapi
bm25 = BM25Okapi(documents)
#计算得分
bm_score = BM25Okapi.get_scores(documents, query=queries
)
但是这段代码不管用。
接着我尝试这样做:
import math
import numpy as np
from multiprocessing import Pool, cpu_count
nd = len(documents) # 语料库大小 = 3612
(我不确定这是否必要)
class BM25:
    """Base class that indexes a corpus for BM25-style ranking.

    The constructor records per-document lengths and term frequencies,
    computes the average document length, and then hands the
    document-frequency table to ``_calc_idf``.  Subclasses must implement
    ``_calc_idf`` and ``get_scores``.

    Args:
        documents: Sequence of documents.  Each document is iterated item by
            item and every item is counted as one term, so documents should
            already be token lists (a plain string would be iterated
            character by character) unless ``tokenizer`` is given.
        tokenizer: Optional callable applied to every document to turn it
            into an iterable of terms.  It is run through a
            ``multiprocessing.Pool``.
    """

    def __init__(self, documents, tokenizer=None):
        self.corpus_size = len(documents)
        self.avgdl = 0          # average document length, in terms
        self.doc_freqs = []     # one {term: count} dict per document
        self.idf = {}           # term -> idf weight, filled by _calc_idf
        self.doc_len = []       # number of terms in each document
        self.tokenizer = tokenizer

        if tokenizer:
            documents = self._tokenize_corpus(documents)

        nd = self._initialize(documents)
        self._calc_idf(nd)

    def _initialize(self, documents):
        """Collect corpus statistics.

        Appends to ``self.doc_len`` and ``self.doc_freqs`` and sets
        ``self.avgdl``.

        Returns:
            dict mapping each term to the number of documents containing it.
        """
        nd = {}  # word -> number of documents with word
        total_terms = 0
        for document in documents:
            length = len(document)
            self.doc_len.append(length)
            total_terms += length

            frequencies = {}
            for word in document:
                frequencies[word] = frequencies.get(word, 0) + 1
            self.doc_freqs.append(frequencies)

            # Each distinct term of this document adds one to its
            # document frequency.
            for word in frequencies:
                nd[word] = nd.get(word, 0) + 1

        # An empty corpus would otherwise raise ZeroDivisionError here.
        self.avgdl = total_terms / self.corpus_size if self.corpus_size else 0
        return nd

    def _tokenize_corpus(self, documents):
        """Apply ``self.tokenizer`` to every document using a process pool."""
        # The context manager releases the worker processes once map() has
        # returned; the original never closed the pool.
        with Pool(cpu_count()) as pool:
            return pool.map(self.tokenizer, documents)

    def _calc_idf(self, nd):
        """Fill ``self.idf`` from the document-frequency table ``nd``."""
        raise NotImplementedError()

    def get_scores(self, queries):
        """Return one relevance score per indexed document."""
        raise NotImplementedError()

    def get_top_n(self, queries, documents, n=5):
        """Return the ``n`` highest-scoring entries of ``documents``.

        Args:
            queries: Passed unchanged to ``get_scores``.
            documents: Sequence with one entry per indexed document, in
                index order; the returned items are taken from it.
            n: Maximum number of documents to return.

        Returns:
            list of items from ``documents``, best score first.

        Raises:
            AssertionError: if ``documents`` does not have ``corpus_size``
                entries.
        """
        assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"

        scores = self.get_scores(queries)
        # argsort is ascending, so reverse it before taking the first n.
        top_n = np.argsort(scores)[::-1][:n]
        return [documents[i] for i in top_n]
class BM25T(BM25):
    """BM25 ranker whose term score is lower-bounded by ``delta``.

    NOTE(review): the scoring formula has the shape of the BM25+ variant
    (a constant ``delta`` added to the normalised term frequency) - confirm
    against the intended reference.

    Args:
        documents: Corpus to index; see ``BM25``.
        k1: Term-frequency saturation parameter.
        b: Strength of document-length normalisation.
        delta: Constant added to every term's normalised frequency.
        tokenizer: Optional callable forwarded to ``BM25`` to tokenize each
            document.  Added last so existing positional calls keep working.
    """

    def __init__(self, documents, k1=1.5, b=0.75, delta=1, tokenizer=None):
        # Algorithm specific parameters; they must be set before the base
        # constructor runs because it calls _calc_idf.
        self.k1 = k1
        self.b = b
        self.delta = delta
        super().__init__(documents, tokenizer)

    def _calc_idf(self, nd):
        """Store ``log((corpus_size + 1) / document_frequency)`` per term."""
        for word, freq in nd.items():
            self.idf[word] = math.log((self.corpus_size + 1) / freq)

    def get_scores(self, queries):
        """Score every indexed document against one tokenized query.

        Args:
            queries: Iterable of query terms.  Each item is looked up as a
                single term, so pass a list of tokens (a plain string would
                be iterated character by character).

        Returns:
            numpy array of length ``corpus_size`` with one score per
            document, in index order.
        """
        score = np.zeros(self.corpus_size)
        doc_len = np.array(self.doc_len)
        # The length-normalisation term does not depend on the query term,
        # so compute it once instead of once per term.
        length_norm = self.k1 * (1 - self.b + self.b * doc_len / self.avgdl)
        for q in queries:
            idf = self.idf.get(q)
            if not idf:
                # Terms unknown to the index contribute nothing; skip the
                # per-document frequency scan for them.
                continue
            q_freq = np.array([doc.get(q, 0) for doc in self.doc_freqs])
            score += idf * (self.delta + (q_freq * (self.k1 + 1)) /
                            (length_norm + q_freq))
        return score
然后我尝试获取得分:
score = BM25.get_scores(self=documents, queries)
但我收到的消息是:
score = BM25.get_scores(self=documents, queries)
SyntaxError: positional argument follows keyword argument
有没有人知道为什么会出现这个错误?谢谢大家。
相关问题:
- 暂无相关问题
3 个回答
0
目前 bm25 中还没有实现这个功能。
2
我建议你使用fastbm25,因为它比其他版本的bm25更快。
pip install fastbm25
使用方法
# Usage example for the third-party fastbm25 package.
from fastbm25 import fastbm25
# Toy corpus: one raw sentence per document.
corpus = [
"How are you !",
"Hello Jack! Nice to meet you!",
"I am from China, I like math."
]
# Lower-case every document and split it on single spaces to get token lists.
tokenized_corpus = [doc.lower().split(" ") for doc in corpus]
# Build the model from the tokenized corpus.
model = fastbm25(tokenized_corpus)
# Lower-case the query and split it on whitespace.
query = "where are you from".lower().split()
# Request the single best match (k=1).
# NOTE(review): the shape of the returned value is defined by fastbm25 - see its README.
result = model.top_k_sentence(query,k=1)
print(result)
你可以从这里了解更多信息:https://github.com/zhusleep/fastbm25
3
1) 先对语料库进行分词,或者把分词函数传给这个类。
2) 调用 get_scores 方法时只传入(已分词的)查询。
查看官方示例。
# Usage example for the third-party rank_bm25 package.
from rank_bm25 import BM25Okapi
# Toy corpus: one raw sentence per document.
corpus = [
"Hello there good man!",
"It is quite windy in London",
"How is the weather today?"
]
# Split every document on single spaces so the index receives token lists.
tokenized_corpus = [doc.split(" ") for doc in corpus]
# Build the index once, on an instance, from the tokenized corpus.
bm25 = BM25Okapi(tokenized_corpus)
query = "windy London"
# Tokenize the query the same way as the documents.
tokenized_query = query.split(" ")
# Call get_scores on the instance and pass only the tokenized query.
doc_scores = bm25.get_scores(tokenized_query)