Word-alignment MemoryError in Python

Published 2024-04-20 09:01:16


******** Read from corpus ********************
NEWRESULT
Traceback (most recent call last):
File "em2.py", line 168, in <module>
main()
File "em2.py", line 165, in main
print  em_run(NEWSENTENCES)
File "em2.py", line 54, in em_run
for source, target in sentence_pairs] 
MemoryError

I get the error above when I try to run the program below. Can anyone help me fix it? (The traceback points at line 54 of em2.py, which is the list comprehension in `em_run` that builds `alignments`.)

The program takes two text files, "sample1" and "sample2". "sample1" contains one sentence in Malayalam with 14 words; "sample2" contains the corresponding English translation, which has 23 words. The program computes the alignment of corresponding words between the two languages.

The program actually works for sentences of up to 10 words; the trouble starts once a sentence has more than 10 words, as the quick check below suggests.
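For scale, here is a quick check of my own (not part of the program) of how many permutations the `alignments` comprehension in `em_run` builds per sentence. At 10 target words that is about 3.6 million; at 23 words it is on the order of 10^22, far more than can ever fit in memory:

import math

for n in (10, 14, 23):
    print n, math.factorial(n)

# prints:
# 10 3628800
# 14 87178291200
# 23 25852016738884976640000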

#!/usr/bin/env python

from itertools import izip
from collections import defaultdict
import copy
import itertools
import operator
import codecs
import StringIO


def em_run(sentence_pairs):
    """Run expectation-maximization on a list of pairs of the form
    `(source_tokens, target_tokens)`,
    where `source_tokens` is a list of tokens in the source language and
    `target_tokens` is a list of tokens for a translationally equivalent
    sentence in the target language.

    Returns a mapping `(t1, t2) => p` where `t1` is a source-language
    token, `t2` is a target-language token, and the value `p` represents
    $P(t1|t2)$.
    """
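    # EM outline: the E-step below weights every candidate alignment by the
    # product of its pairwise probabilities under the current table; the
    # M-step pools those weights per (source, target) pair and renormalizes
    # them into new conditional probabilities.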


    source_sentences, target_sentences = zip(*sentence_pairs)
    #print " \n SOURCE SENTENCE"
    #print "\n", source_sentences
    #print "\n TARGET SENTENCE"
    #print "\n", target_sentences

    source_vocabulary = set(itertools.chain.from_iterable(source_sentences))
    #print "\nSOURCE VOCABULARY"
    #print "\n", source_vocabulary

    target_vocabulary = set(itertools.chain.from_iterable(target_sentences))
    #print "\n TARGET VOCABULARY"
    #print "\n", target_vocabulary

    # Value with which to initialize each conditional probability
    uniform_prob = 1.0 / len(source_vocabulary)
    #print len(source_vocabulary)
    #print "\n INITIAL PROBABILITY VALUE\n"
    #print uniform_prob

    conditional_probs_old = None
    conditional_probs = {(source_w, target_w): uniform_prob
                         for source_w in source_vocabulary
                         for target_w in target_vocabulary}
    #print "\n INITIAL CONDITIONAL PROBABILITY VALUE"
    #print "\n", conditional_probs

    #alignments = [(source,target) for source,target in sentence_pairs]
    alignments = [[zip(source, target_perm)
                   for target_perm in itertools.permutations(target)]
                  for source, target in sentence_pairs]
    #print "\n SET OF ALL POSSIBLE ALIGNMENTS\n"
    #print alignments
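    # This is the line the MemoryError traceback points at (em2.py line 54):
    # the comprehension materializes one zipped alignment for every
    # permutation of the target sentence, i.e. len(target)! lists per
    # sentence pair.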

    '''fw=codecs.open('file_mal.txt','w','utf8')
    fw.write(str(alignments))
    fw.close()'''

    # Repeat until convergence
    i = 0
    while conditional_probs_old != conditional_probs:
        conditional_probs_old = copy.copy(conditional_probs)
        alignment_probs = {
            i: {
                tuple(alignment):
                reduce(operator.mul, [conditional_probs[pair]
                                      for pair in alignment])
                for alignment in sentence_alignments
            }
            for i, sentence_alignments in enumerate(alignments)
        }
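        # `alignment_probs[i]` now maps each candidate alignment of sentence
        # i to the product of its pairwise conditional probabilities.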



        # Normalize alignment probabilities
        for sentence_idx, sentence_alignments in alignment_probs.iteritems():
            total = float(sum(sentence_alignments.values()))
            probs = {alignment: value / total
                     for alignment, value in sentence_alignments.iteritems()}
            alignment_probs[sentence_idx] = probs

        # Now join all alignments and begin the maximization step: group
        # by target-language word and collect corresponding
        # source-language probabilities
        word_translations = defaultdict(lambda: defaultdict(float))
        for sentence_alignments in alignment_probs.itervalues():
            for word_pairs, prob in sentence_alignments.iteritems():
                for source_word, target_word in word_pairs:
                    #print source_word
                    word_translations[target_word][source_word] += prob

        # Now calculate new conditional probability mapping, ungrouping
        # the `word_translations` tree and normalizing values into
        # conditional probabilities
        conditional_probs = {}
        for target_word, translations in word_translations.iteritems():
            total = float(sum(translations.values()))
            for source_word, score in translations.iteritems():
                conditional_probs[source_word, target_word] = score / total
        #print conditional_probs

    # After convergence, collect the source words and report the best
    # pairing for each one via `final`
    st = []
    words = []
    for key, value in conditional_probs.iteritems():
        #print key[0]
        words.append(key[0])
    #print words
    st = set(words)
    #print st
    for i in st:
        final(i, conditional_probs)

    with codecs.open('output_data.txt', 'w', 'utf8') as fp:
        for key, value in conditional_probs.iteritems():
            fp.write(key[0].decode('utf8') + " , " + key[1] + " : " + str(value))
        fp.write("\n")
    return conditional_probs

def final(i, conditional_probs):
    # For the source word `i`, scan the probability table (substring match
    # on the source side), keep the highest-scoring (source, target) pair,
    # and append it to final.txt.
    lst = []
    output = StringIO.StringIO()
    ha = []
    val = 0
    val1 = 0
    #print i
    fd = open('final.txt', 'a')
    for key, value in conditional_probs.iteritems():
        if i in key[0]:
            val1 = value
            if val < val1:
                lst = value
                key2 = key[0]
                key3 = key[1]
                val = value
            ha.append(value)
    output.write(key2 + "," + key3 + '=' + str(val) + "   \n")
    contents = output.getvalue()

    fd.write(contents)
    fd.close()
    output.close()
    #print lst
    #print ha

def main():

    print "******** Read from corpus ********************"
    NEWSENTENCES = []
    with open("sample1") as textMal, open("sample2") as textEn:
        for x, y in izip(textMal, textEn):
            x = x.strip().split()
            y = y.strip().split()
            NEWSENTENCES.append((x, y))
            #print x
            #print y

    print "NEWRESULT"
    print em_run(NEWSENTENCES)

if __name__ == '__main__':
    main()

sample1

ദ;അതു വെള്ളത്തിന്നും വെള്ളത്തിന്നും തമ്മിൽ വേർപിരിവായിരിക്കട്ടെ എന്നു കല്പിച്ചു.

sample2

And God said, Let there be a firmament in the midst of the waters, and let it divide the waters from the waters.
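For reference, the usual way around this explosion is the IBM Model 1 formulation of the same EM problem: since each source word is assumed to align to one target word independently, the E-step can be computed in closed form per word pair, with no enumeration of permutations. Below is a minimal sketch of that idea, not a drop-in rewrite of the code above; the name `em_run_model1` and the fixed `iterations` count are my own.

from collections import defaultdict
import itertools

def em_run_model1(sentence_pairs, iterations=20):
    # Returns the same kind of mapping as em_run: (source_w, target_w) => p.
    source_vocab = set(itertools.chain.from_iterable(s for s, t in sentence_pairs))
    t_prob = defaultdict(lambda: 1.0 / len(source_vocab))  # P(source | target)
    for _ in range(iterations):
        counts = defaultdict(float)  # expected (source, target) counts
        totals = defaultdict(float)  # expected counts per target word
        for source, target in sentence_pairs:
            for s_word in source:
                # Each source word aligns independently, so its posterior
                # over target words factorizes: O(len(source) * len(target))
                # work per sentence instead of len(target)! alignments.
                norm = sum(t_prob[(s_word, t_word)] for t_word in target)
                for t_word in target:
                    delta = t_prob[(s_word, t_word)] / norm
                    counts[(s_word, t_word)] += delta
                    totals[t_word] += delta
        # M-step: renormalize the expected counts per target word
        t_prob = defaultdict(float,
                             {(s, t): c / totals[t]
                              for (s, t), c in counts.items()})
    return dict(t_prob)

It is called the same way as `em_run`, e.g. `print em_run_model1(NEWSENTENCES)`, and should run comfortably on the 14/23-word sentence pair above.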

