调用 str.split() 时抛出 AttributeError：'NoneType' 对象没有属性 'split'

import MeCab

# Read the file containing the long string of Japanese text. The context
# manager closes the handle as soon as the text is in memory.
# NOTE(review): this relies on the platform default encoding -- pass
# encoding= explicitly once the file's encoding is confirmed.
with open('output_text.txt') as file:
    str_text = file.read()

# Pass the first 25 characters to the MeCab tokenizer/tagger and split the
# returned text into one entry per line.
tagger = MeCab.Tagger()
words = tagger.parse(str_text[:25]).split('\n')[:-2]  # last two entries are just some tagger info

for word in words:
    # Each entry is tab-separated: [word, comma-separated tagger info].
    temp_str = word.split('\t')
    print(temp_str)

['この', '連体詞,*,*,*,*,*,この,コノ,コノ'] ['ページ', '名詞,一般,*,*,*,*,ページ,ページ,ページ'] ['は', '助詞,係助詞,*,*,*,*,は,ハ,ワ'] ['以下', '名詞,非自立,副詞可能,*,*,*,以下,イカ,イカ'] ['に', '助詞,格助詞,一般,*,*,*,に,ニ,ニ'] ['ある', '動詞,自立,*,*,五段・ラ行,基本形,ある,アル,アル'] ['削除', '名詞,サ変接続,*,*,*,*,削除,サクジョ,サクジョ'] ['依頼', '名詞,サ変接続,*,*,*,*,依頼,イライ,イライ'] ['の', '助詞,連体化,*,*,*,*,の,ノ,ノ'] ['議論', '名詞,サ変接続,*,*,*,*,議論,ギロン,ギロン'] ['を', '助詞,格助詞,一般,*,*,*,を,ヲ,ヲ'] ['保存', '名詞,サ変接続,*,*,*,*,保存,ホゾン,ホゾン'] ['し', '動詞,自立,*,*,サ変・スル,連用形,する,シ,シ'] ['た', '助動詞,*,*,*,特殊・タ,基本形,た,タ,タ'] ['もの', '名詞,非自立,一般,*,*,*,もの,モノ,モノ']

%%time #load the txt file with Japanese characters: file = open('output_text.txt') str_text = file.read() #boundries for the text blocks used in the below for loop lower = 0 upper = 100000 #dictionary for words and kanji characters counts_words = dict() counts_kanji = dict() word_counter = 0 #tokenizer/tagger tagger = MeCab.Tagger() #splits strings into a list, used for words that have more than one character to get individual characters def splitter(word): return list(word) #break condition for the loop condition = 'no' while True: if condition == 'yes': break #this is for the last block of 100k increments elif lower > 133400001: #initiate break condition condition = 'yes' words = tagger.parse(str_text[lower:]).split('\n')[:-2] print('Last block, chief!',lower,':',upper) lower+=100000 upper+=100000 for word in words: temp_str = word.split('\t') word_counter+=1 counts_words[temp_str[0]+' '+temp_str[1]] = counts_words.get(temp_str[0]+' '+temp_str[1], 0) + 1 if len(temp_str[0])>1: for i in splitter(temp_str[0]): counts_kanji[i] = counts_kanji.get(i, 0) + 1 break else: counts_kanji[temp_str[0]] = counts_kanji.get(temp_str[0], 0) + 1 break else: #pass string 100k long string block into a tokenizer/tagger words = tagger.parse(str_text[lower:upper]).split('\n')[:-2] #increment the lower and upper boundries of the str blocks lower+=100000 upper+=100000 #iterate through each word parsed by the tokenizer for word in words: temp_str = word.split('\t') #split each word data by tab, [word, info] word_counter+=1 #count number of words #check if the entry exists in the words dict, either add or increment the counts counts_words[temp_str[0]+' '+temp_str[1]] = counts_words.get(temp_str[0]+' '+temp_str[1], 0) + 1 #check if the word has more than one character, if yes split it and add each character to the kanji dict if len(temp_str[0])>1: for i in splitter(temp_str[0]): #check if the entry exists in the words dict, either add or increment the counts counts_kanji[i] = 
counts_kanji.get(i, 0) + 1 else: counts_kanji[temp_str[0]] = counts_kanji.get(temp_str[0], 0) + 1

1条回答

网友

1楼 · 发布于 2024-06-16 11:17:20

我是mecab-python3的开发者

我想你可能已经就此事给我发过邮件了，但请不要向 MeCab 传递长达 100 万字符的字符串。MeCab 是在假设输入为单个句子的前提下开发的。它很健壮，也能处理更长的字符串——例如，处理段落时不会遇到问题——但传入如此长的字符串基本上是在毫无益处地进入未经测试的领域。

在将输入文本传递给 MeCab 之前，请先将其拆分为段落或句子。

此外，关于这一点：

I could potentially pass the string in 1M chunks but it feels wrong losing that much data when there might be a solution somewhere.

传递较短的字符串不会丢失任何数据。我不知道你指的是什么

相关问题更多 >

编程相关推荐

热门问题

热门文章