ElasticSearch:在我的Django web应用程序中索引文档

2024-04-19 02:38:52 发布

您现在位置:Python中文网/ 问答频道 /正文

我第一次尝试在我的 Web 应用程序中使用 ElasticSearch,但在 ES 索引(indexation)方面遇到了困难。

过程是:

  1. 当我添加一个带有upload字段的文档时,该文档由ES索引
  2. 我有一个函数可以定义一个新的文档标题
  3. 我必须用新标题重新索引此文档,以便出现在ES文档列表中

我在最后一步遇到了一些问题,想知道你们能否帮我。

实际进程:

models.py 文件:

class Document(EdqmFullTable):
    """Model excerpt for an uploaded document.

    Only the members relevant to indexing are shown; the rest of the class
    body is elided in this excerpt.
    """
    # Presumably the allowed document categories (values elided here).
    CAT_CHOICES = (...)

    ...
    # Uploaded file; the storage location is delegated to ``upload_file``.
    file = models.FileField(upload_to=upload_file)

    def get_filename(self):
        """Return the stored file name joined onto ``settings.MEDIA_ROOT``."""
        return os.path.join(settings.MEDIA_ROOT, str(self.file))

通过此模型添加新文档时,将调用ES方法:

es4omcl.py 文件:

class EdqmES(object):
    """Small wrapper around an Elasticsearch client used to index documents.

    ``host`` and ``port`` are class-level defaults; both can be overridden per
    instance with keyword arguments of the same name.
    """

    host = 'localhost'
    port = 9200
    es = None

    def __init__(self, *args, **kwargs):
        """Create the client connection.

        :param host: Optional server host (defaults to the class attribute)
        :param port: Optional server port (defaults to the class attribute)
        """
        self.host = kwargs.pop('host', self.host)
        self.port = kwargs.pop('port', self.port)

        # Connect to ElasticSearch server
        self.es = Elasticsearch([{
            'host': self.host,
            'port': self.port
        }])

    def __str__(self):
        """Return the server address as ``host:port``."""
        # The port is an int by default, so it has to be formatted; plain
        # string concatenation would raise TypeError.
        return '{}:{}'.format(self.host, self.port)

    @staticmethod
    def file_encode(filename):
        """Return the content of ``filename`` as a base64 text string.

        :param filename: Path of the file to read
        :raises IOError: if the file cannot be opened or read
        """
        with open(filename, "rb") as f:
            return b64encode(f.read()).decode('utf-8')

    def create_pipeline(self):
        """Register the ``attachment`` pipeline used by :meth:`index_document`.

        The pipeline runs the attachment processor on the ``data`` field,
        stores the result under ``attachment`` and then drops the raw ``data``.
        """
        body = {
            "description": "Extract attachment information",
            "processors": [
                {"attachment": {
                    "field": "data",
                    "target_field": "attachment",
                    # NOTE(review): -1 is expected to mean "no character
                    # limit" for the attachment processor -- confirm against
                    # the server version in use.
                    "indexed_chars": -1
                }},
                {"remove": {"field": "data"}}
            ]
        }
        # NOTE(review): the pipeline is written through the generic index
        # call (index '_ingest', type 'pipeline'); the client also exposes a
        # dedicated ingest API -- confirm which one the deployed client needs.
        self.es.index(
            index='_ingest',
            doc_type='pipeline',
            id='attachment',
            body=body
        )

    def index_document(self, doc, bulk=False):
        """Index one document through the ``attachment`` pipeline.

        :param doc: Object providing ``get_filename()``, ``id``, ``file`` and ``title``
        :param bulk: When true, only build and return the item body without
            sending anything to the server
        :return: The item body (``bulk=True``) or the response of the index call
        """
        filename = doc.get_filename()

        try:
            data = self.file_encode(filename)
        except IOError:
            # Best effort: an unreadable file is still indexed, with empty content
            data = ''
            print('ERROR with ' + filename)
            # TODO: log error

        # NOTE(review): '_id' is also sent inside the body on the non-bulk
        # path; confirm the server accepts that field name in the source.
        item_body = {
            '_id': doc.id,
            'data': data,
            'relative_path': str(doc.file),
            'title': doc.title,
        }

        if bulk:
            return item_body

        result1 = self.es.index(
            index='omcl', doc_type='annual-report',
            id=doc.id,
            pipeline='attachment',
            body=item_body,
            request_timeout=60
        )
        print(result1)
        return result1

以及 callback.py 文件中的信号处理函数,它在新文档保存到数据库时被调用:

@receiver(signals.post_save, sender=Document, dispatch_uid='add_new_doc')
def add_document_handler(sender, instance=None, created=False, **kwargs):
    """ React to the creation of a document.

    A new declaration of conformity refreshes the conformity date of its OMCL;
    a new annual report is indexed with Elasticsearch. Saves that are not
    creations are ignored.

    :param sender: Model class that sent the signal
    :type sender: the model class
    :param instance: Object which was just saved
    :type instance: model instance
    :param created: True for a creation, False for an update
    :type created: boolean
    :param kwargs: Additional parameters of the signal
    :type kwargs: dict
    """
    if created:
        doc_category = instance.category

        if doc_category == Document.OPT_CD:
            # Declaration of conformity: stamp the owning OMCL with the current date
            stamp = datetime.today()
            Omcl.objects.filter(id=instance.omcl_id).update(last_conformity=stamp)
        elif doc_category == Document.OPT_ANNUAL:
            # Annual reports are the only documents sent to the index
            EdqmES().index_document(instance)

我的流程:

我定义了一个新的类,用于在文档上传之后对其进行管理。我可以修改文档标题。最后一步是:用 ES 重新索引这个修改过的文档。

class ManageDocView(AdminRequiredMixin, View):
    """ Render the Admin Manage documents page used to update the year in a filename.

    GET shows the empty form. POST handles two actions, told apart by the
    submit button name found in ``request.POST``:

    * ``SearchOMCL``: load the selected OMCL and its documents.
    * ``UpdateDocument``: rename the selected document on disk, then store the
      new title and file path in the database.
    """

    template_name = 'omcl/manage_doc_form.html'
    form_class = ManageDocForm
    success_url = 'omcl/manage_doc_form.html'

    def get(self, request):
        """Render the page with an unbound form."""
        form = self.form_class()
        context = {
            "form": form
        }
        return render(request, self.template_name, context)

    def post(self, request):
        """Handle the ``SearchOMCL`` and ``UpdateDocument`` actions.

        :param request: Current HTTP request
        :return: The rendered page, or a redirect to ``manage_doc`` when the
            file to rename is missing on disk
        """
        form = self.form_class()
        query_document_updated = None
        query_omcl = None
        query_document = None

        if "SearchOMCL" in request.POST:
            omcl_list = request.POST['omcl_list']
            query_omcl = Omcl.objects.get(id=omcl_list)
            query_document = Document.objects.filter(omcl=omcl_list)

        elif "UpdateDocument" in request.POST:
            checkbox_id = request.POST['DocumentChoice']
            checkbox_id_minus_1 = int(checkbox_id) - 1

            query_document_updated = Document.objects.get(id=checkbox_id)

            omclcode = query_document_updated.omcl.code
            src_filename = query_document_updated.src_filename
            filename, file_extension = os.path.splitext(src_filename)
            category = query_document_updated.category

            if category == "ANNUAL":
                category = "ANNUAL_REPORT"

            # NOTE(review): year is None when 'q1year' is not posted, which
            # would end up literally in the new name -- confirm the form
            # always submits it.
            year = request.POST.get('q1year')

            # Create the new document title updated by the new year
            new_document_title = f"{year}_{category}_{omclcode}_{checkbox_id_minus_1} - {src_filename}"

            # Create the new document file updated by the new year
            new_document_file = f"omcl_docs/{omclcode}/{year}_{category}_{omclcode}_{checkbox_id_minus_1}{file_extension}"

            # Get file.name in order to rename document file in /media/
            document_path = query_document_updated.file.name

            # Build both paths the same way so a MEDIA_ROOT given with a
            # trailing separator or as a path object is handled too
            actual_document_path = os.path.join(settings.MEDIA_ROOT, document_path)
            new_document_path = os.path.join(settings.MEDIA_ROOT, new_document_file)

            try:
                os.rename(actual_document_path, new_document_path)
            except FileNotFoundError:
                # Interpolate after the translation lookup: an already
                # formatted string can never match a catalog entry
                messages.error(
                    request,
                    _("Document %(name)s doesn't exist in the server") % {'name': src_filename},
                )
                return redirect('manage_doc')
            else:
                # Assign modifications to selected document and save it into the database
                query_document_updated.title = new_document_title
                query_document_updated.file = new_document_file
                query_document_updated.save()
                messages.success(request, _("The modification has been taken account"))

        context = {
            'form': form,
            'query_omcl': query_omcl,
            'query_document': query_document,
            'query_document_updated': query_document_updated,
        }
        return render(request, self.template_name, context)

我完全没有头绪,因为我不知道该如何调整 callback.py 文件中的这一部分:

# Excerpt from add_document_handler: saves that are not creations exit here,
# so nothing after this guard runs when an existing document is updated.
if not created:
    return

使它能够配合我的 Django 视图 ManageDocView() 使用。


Tags: the, path, 文档, self, form, id, new, doc