在 Python 中从 .dta 文件读取俄语文本

2024-04-29 08:24:51 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试从.dta文件中读取俄语字符,当我打印到终端时,我看到以下内容: Яномамо

这看起来不像俄语,所以我想知道我要做什么才能得到真正的俄语字符。我尝试了以下方法,但没有成功: ex = row['name_rus'].encode("cp1251")

我得到:UnicodeEncodeError: 'charmap' codec can't encode characters in position 0-2: character maps to <undefined>

任何指导都将不胜感激!我把我的脚本附在下面,以供参考。

import pandas as pd
import re

def load_large_dta(fname, chunk_size=100 * 1000):
    """Read a Stata ``.dta`` file chunk by chunk into a single DataFrame.

    One ``.`` is printed per chunk as a progress indicator, followed by a
    summary line with the number of rows loaded.

    Args:
        fname: Path of the ``.dta`` file to read.
        chunk_size: Number of rows to request from the reader per chunk.

    Returns:
        A ``pandas.DataFrame`` holding every row that was read, with a fresh
        0..n-1 index. If reading is interrupted with Ctrl-C, the rows read
        so far are returned; if nothing was read, the frame is empty.
    """
    chunks = []

    try:
        # The context manager closes the underlying file even on interrupt.
        with pd.read_stata(fname, chunksize=chunk_size) as reader:
            for chunk in reader:
                # Guard against readers that signal exhaustion with an empty
                # frame instead of ending the iteration.
                if len(chunk) == 0:
                    break
                chunks.append(chunk)
                print('.', flush=True)
    except KeyboardInterrupt:
        # Deliberate best-effort: keep whatever was loaded before Ctrl-C.
        print('\ninterrupted; keeping the rows loaded so far')

    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0
    # and re-copied the accumulated frame on every chunk.
    if chunks:
        df = pd.concat(chunks, ignore_index=True)
    else:
        df = pd.DataFrame()

    print('\nloaded {} rows'.format(len(df)))

    return df



def _repair_mojibake(text):
    """Undo UTF-8 text that was mistakenly decoded as Latin-1.

    Such text shows up as sequences like ``Ð¯Ð½Ð¾`` instead of Cyrillic.
    Re-encoding as Latin-1 recovers the original bytes, which are then
    decoded as UTF-8. Values that are not strings, or that do not round-trip
    (for example text that is already correct Cyrillic), are returned
    unchanged.
    """
    if not isinstance(text, str):
        return text
    try:
        return text.encode("latin-1").decode("utf-8")
    except UnicodeError:
        return text


def store_oid(df1, df2, df3):
    """Map known names to their ``oid`` and print the names found in *df3*.

    Args:
        df1: Frame with an ``id`` column; its values are the known names.
        df2: Frame with a comma-separated ``name_rus`` column and an ``oid``
            column. Each comma-separated part is stripped and looked up
            among the ``id`` values of *df1*.
        df3: Frame with a ``name_rus`` column; every value is printed after
            mis-decoded UTF-8 has been repaired.

    Returns:
        A dict mapping each matched name from *df2* to ``int(oid)``. When a
        name occurs in several rows, the last row wins.
    """
    known_ids = {row['id'] for _, row in df1.iterrows()}

    to_write = {}
    for _, row in df2.iterrows():
        for word in (part.strip() for part in row['name_rus'].split(',')):
            if word in known_ids:
                to_write[word] = int(row['oid'])

    for _, row in df3.iterrows():
        # Encoding the mis-decoded text as cp1251 raises UnicodeEncodeError;
        # the stored bytes are UTF-8, so repair the decoding instead.
        print(_repair_mojibake(row['name_rus']))

    return to_write
def main():
    """Load the three ``.dta`` inputs and hand them to ``store_oid``."""
    sources = (
        'russian_english_names.dta',
        'Ruthenia2_duplicates.dta',
        'tradition_english_russian_EA_Augu16th.dta',
    )
    # Files are loaded in the listed order; store_oid takes the second
    # file first, then the first, then the third.
    names_frame, duplicates_frame, tradition_frame = [
        load_large_dta(path) for path in sources
    ]
    store_oid(duplicates_frame, names_frame, tradition_frame)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

Tags: name, in, df, for, index, english, load, ex