How can I replace the list format with text files and pass them as arguments to a function defined in Python?

Posted 2024-04-18 19:27:55


How can I pass two text files as arguments to the defined function, instead of passing a list?

My code contains three pairs of sentences defined in a list, which is passed as an argument to the function em_run.

Now I need to read in two corpora, i.e. two separate text files, into the function instead of these three sentence pairs.

Here is my code:

#!/usr/bin/env python
"""An implementation of the IBM Model 1 expectation-maximization algorithm  for learning word alignments."""

from collections import defaultdict
import copy
import itertools
import operator


def em_run(sentence_pairs):
    """Run expectation-maximization on a list of pairs of the form
    `(source_tokens, target_tokens)`, where `source_tokens` is a list of
    tokens in the source language and `target_tokens` is a list of tokens
    for a translationally equivalent sentence in the target language.

    Returns a mapping `(t1, t2) => p` where `t1` is a source-language
    token, `t2` is a target-language token, and the value `p` represents
    $P(t1|t2)$.
    """
    source_sentences, target_sentences = zip(*sentence_pairs)
    source_vocabulary = set(itertools.chain.from_iterable(source_sentences))
    target_vocabulary = set(itertools.chain.from_iterable(target_sentences))

    # Value with which to initialize each conditional probability
    uniform_prob = 1.0 / len(source_vocabulary)

    conditional_probs_old = None
    conditional_probs = {(source_w, target_w): uniform_prob
                         for source_w in source_vocabulary
                         for target_w in target_vocabulary}

    alignments = [[zip(source, target_perm)
                   for target_perm in itertools.permutations(target)]
                  for source, target in sentence_pairs]

    # Repeat until convergence
    i = 0
    while conditional_probs_old != conditional_probs:
        conditional_probs_old = copy.copy(conditional_probs)

        alignment_probs = {
            i: {
                tuple(alignment):
                reduce(operator.mul, [conditional_probs[pair]
                                      for pair in alignment])
                for alignment in sentence_alignments
            }
            for i, sentence_alignments in enumerate(alignments)
        }

        # Normalize alignment probabilities
        for sentence_idx, sentence_alignments in alignment_probs.iteritems():
            total = float(sum(sentence_alignments.values()))
            probs = {alignment: value / total
                     for alignment, value in sentence_alignments.iteritems()}
            alignment_probs[sentence_idx] = probs

        # Now join all alignments and begin the maximization step: group
        # by target-language word and collect corresponding
        # source-language probabilities
        word_translations = defaultdict(lambda: defaultdict(float))
        for sentence_alignments in alignment_probs.itervalues():
            for word_pairs, prob in sentence_alignments.iteritems():
                for source_word, target_word in word_pairs:
                    word_translations[target_word][source_word] += prob

        # Now calculate new conditional probability mapping, ungrouping
        # the `word_translations` tree and normalizing values into
        # conditional probabilities
        conditional_probs = {}
        for target_word, translations in word_translations.iteritems():
            total = float(sum(translations.values()))
            for source_word, score in translations.iteritems():
                conditional_probs[source_word, target_word] = score / total

    return conditional_probs


def main():
    SENTENCES = [
        ('mi casa verde'.split(), 'my green house'.split()),
        ('casa verde'.split(), 'green house'.split()),
        ('la casa'.split(), 'the house'.split()),
    ]
    print em_run(SENTENCES)

if __name__ == '__main__':
    main()

2 Answers

If each file contains one half of each sentence pair, and each line of the first file corresponds to the same line in the second file, you can simply open the files and zip them together:

em_run(zip(open('file1'), open('file2')))
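Note that em_run expects each pair to be two lists of tokens, while iterating over a file yields whole lines, so you would most likely want to split each line first. A minimal sketch, assuming the hypothetical file names file1 and file2 each hold one sentence per line:

# Hypothetical file names; line i of file1 is translationally
# equivalent to line i of file2
with open('file1') as source_file, open('file2') as target_file:
    pairs = [(source_line.split(), target_line.split())
             for source_line, target_line in zip(source_file, target_file)]

print em_run(pairs)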

I see two approaches:

  1. Use persistence. This way you can save almost any object to some storage, e.g. a file, but the saved form cannot always be edited by hand (see the sketch after this list).
  2. Write a string representation to a file and parse it back. This is a good approach for simple objects such as numbers and strings, but it requires manual parsing.
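A minimal sketch of the first approach, using the standard pickle module (the file name sentence_pairs.pkl is just an example):

import pickle

sentence_pairs = [
    ('mi casa verde'.split(), 'my green house'.split()),
    ('casa verde'.split(), 'green house'.split()),
    ('la casa'.split(), 'the house'.split()),
]

# Persist the Python objects to disk in pickle's binary format...
with open('sentence_pairs.pkl', 'wb') as f:
    pickle.dump(sentence_pairs, f)

# ...and later load them back, ready to pass straight to em_run
with open('sentence_pairs.pkl', 'rb') as f:
    loaded_pairs = pickle.load(f)

As noted above, the pickled file is binary and not meant to be edited by hand; the second approach trades that convenience for a human-readable format.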

If you read the data from files... suppose they have the following format:

English

my green house
green house
the house

Malayalam

mi casa verde
casa verde
la casa
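
For reference, the two data files opened in main below could be created like this (one sentence per line, with lines aligned across the two files):

# Write the sample corpora to the file names used in main()
with open('datafile_english', 'w') as f:
    f.write('my green house\ngreen house\nthe house\n')

with open('datafile_malayalam', 'w') as f:
    f.write('mi casa verde\ncasa verde\nla casa\n')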

Below is the code, changed to read from the files instead of lists:

#!/usr/bin/env python

from itertools import izip
from collections import defaultdict
import copy
import itertools
import operator


def em_run(sentence_pairs):
    """Run expectation-maximization on a list of pairs of the form
    `(source_tokens, target_tokens)`, where `source_tokens` is a list of
    tokens in the source language and `target_tokens` is a list of tokens
    for a translationally equivalent sentence in the target language.

    Returns a mapping `(t1, t2) => p` where `t1` is a source-language
    token, `t2` is a target-language token, and the value `p` represents
    $P(t1|t2)$.
    """

    source_sentences, target_sentences = zip(*sentence_pairs)
    source_vocabulary = set(itertools.chain.from_iterable(source_sentences))
    target_vocabulary = set(itertools.chain.from_iterable(target_sentences))

    # Value with which to initialize each conditional probability
    uniform_prob = 1.0 / len(source_vocabulary)

    conditional_probs_old = None
    conditional_probs = {(source_w, target_w): uniform_prob
                         for source_w in source_vocabulary
                         for target_w in target_vocabulary}

    alignments = [[zip(source, target_perm)
                   for target_perm in itertools.permutations(target)]
                  for source, target in sentence_pairs]

    # Repeat until convergence
    i = 0
    while conditional_probs_old != conditional_probs:
        conditional_probs_old = copy.copy(conditional_probs)

        alignment_probs = {
            i: {
                tuple(alignment):
                reduce(operator.mul, [conditional_probs[pair]
                                      for pair in alignment])
                for alignment in sentence_alignments
            }
            for i, sentence_alignments in enumerate(alignments)
        }

        # Normalize alignment probabilities
        for sentence_idx, sentence_alignments in alignment_probs.iteritems():
            total = float(sum(sentence_alignments.values()))
            probs = {alignment: value / total
                     for alignment, value in sentence_alignments.iteritems()}
            alignment_probs[sentence_idx] = probs

        # Now join all alignments and begin the maximization step: group
        # by target-language word and collect corresponding
        # source-language probabilities
        word_translations = defaultdict(lambda: defaultdict(float))
        for sentence_alignments in alignment_probs.itervalues():
            for word_pairs, prob in sentence_alignments.iteritems():
                for source_word, target_word in word_pairs:
                    word_translations[target_word][source_word] += prob

        # Now calculate new conditional probability mapping, ungrouping
        # the `word_translations` tree and normalizing values into
        # conditional probabilities
        conditional_probs = {}
        for target_word, translations in word_translations.iteritems():
            total = float(sum(translations.values()))
            for source_word, score in translations.iteritems():
                conditional_probs[source_word, target_word] = score / total

    return conditional_probs


def main():
    SENTENCES = [
        ('mi casa verde'.split(), 'my green house'.split()),
        ('casa verde'.split(), 'green house'.split()),
        ('la casa'.split(), 'the house'.split()),
    ]
    print "Original SENTENCES"
    print "Original results", em_run(SENTENCES)
    print "******** Read words from files ********************"
    NEWSENTENCES = []
    with open("datafile_english") as textEn, open("datafile_malayalam") as textMal:
        for x, y in izip(textEn, textMal):
            # Strip the trailing newline and tokenize each line
            x = x.strip().split()
            y = y.strip().split()
            # (source_tokens, target_tokens) order, matching SENTENCES
            NEWSENTENCES.append((y, x))
    print "NEWRESULT", em_run(NEWSENTENCES)


if __name__ == '__main__':
    main()

Output:

Original SENTENCES
Original results {('mi', 'green'): 0.16666666666666669, ('verde', 'my'): 0.3333333333333333, ('la', 'the'): 0.5, ('mi', 'my'): 0.3333333333333333, ('mi', 'house'): 0.1111111111111111, ('casa', 'the'): 0.5, ('casa', 'my'): 0.3333333333333333, ('verde', 'house'): 0.27777777777777773, ('casa', 'house'): 0.4444444444444444, ('casa', 'green'): 0.4166666666666667, ('verde', 'green'): 0.4166666666666667, ('la', 'house'): 0.16666666666666666}
******** Read words from files ********************
NEWRESULT {('mi', 'green'): 0.16666666666666669, ('verde', 'my'): 0.3333333333333333, ('la', 'the'): 0.5, ('mi', 'my'): 0.3333333333333333, ('mi', 'house'): 0.1111111111111111, ('casa', 'the'): 0.5, ('casa', 'my'): 0.3333333333333333, ('verde', 'house'): 0.27777777777777773, ('casa', 'house'): 0.4444444444444444, ('casa', 'green'): 0.4166666666666667, ('verde', 'green'): 0.4166666666666667, ('la', 'house'): 0.16666666666666666}
