使用字典推导式从结构化文本生成扁平字典

# Sample input from the question.
text = 'english (fluently), spanish (poorly)'
# desired output: {english: fluently, spanish: poorly}
# one of my many attempts:
#     dict((language,proficiency.strip('\(\)')) for language,proficiency in lp.split(' ') for lp in text.split(', '))
# but resulting error: NameError: name 'lp' is not defined

# pandas dataframe
#     pd.DataFrame({'language': ['english, spanish (poorly)', 'turkish']})
#
# desired output:
#     pd.DataFrame({'Language: English': [True, False],
#                   'Language proficiency: English': ['average', pd.NA],
#                   'Language: Spanish': [True, False],
#                   'Language proficiency: Spanish': ['poorly', pd.NA],
#                   'Language: Turkish': [False, True],
#                   'Language proficiency: Turkish': [pd.NA, 'average']})
#
# my attempt
#     def tidy(content):
#         if pd.isna(content):
#             pass
#         else:
#             dict((language,proficiency.strip('\(\)')) for language,proficiency in lp.split(' ') for lp in text.split(', '))
#
#     def tidy_language(language, content):
#         if pd.isna(content):
#             return pd.NA
#         else:
#             if language in content.keys():
#                 return True
#             else:
#                 return False
#
#     def tidy_proficiency(language, content):
#         if pd.isna(content):
#             return pd.NA
#         else:
#             if language in content.keys():
#                 return content.language
#             else:
#                 return pd.NA
#
#     languages = ['english', 'spanish', 'turkish']
#     df['language'] = df['language'].map(lambda x: tidy(x))
#     for language in languages:
#         df['Language: {}'.format(language.capitalize())] = df['language'].map(lambda x: tidy_language(language, content)
#         df['Language proficiency: {}'.format(language.capitalize())] = df['language'].map(lambda x: tidy_proficiency(language, content)

3条回答

网友

1楼 · 编辑于 2024-05-16 02:57:49

这里有一个快速的解决方案。将文本传入下面的函数即可：

def text_to_dict(text):
    """Build a dictionary from text such as ``"english (fluently), spanish (poorly)"``.

    Commas and parentheses are discarded, the remainder is split on
    whitespace, and the resulting tokens are paired up by position: the first
    token of each pair becomes the key and the second its value.

    Args:
        text: String of ``key (value)`` entries separated by commas.

    Returns:
        A dict mapping each key token to the token that follows it. Empty
        input yields an empty dict, and a trailing token without a partner
        is ignored. If a key occurs more than once, the last value wins.
    """
    # Remove the separator characters in a single pass; ``split()`` without
    # arguments then collapses repeated whitespace, so no empty tokens appear.
    tokens = text.translate(str.maketrans("", "", ",()")).split()

    # Pair tokens by position rather than by looking each value up again:
    # a word that occurs more than once must not be confused with its
    # first occurrence.
    return dict(zip(tokens[::2], tokens[1::2]))

if __name__ == "__main__":
    # Demo run: parse a sample string and show the resulting dictionary.
    text = "english (fluently), spanish (poorly), bangla (fluently)"
    parsed = text_to_dict(text)
    print(parsed)

网友

2楼 · 编辑于 2024-05-16 02:57:49

您需要调换列表推导式中两个for子句的顺序（for子句的先后顺序必须与编写普通嵌套循环时相同）
在.strip('\(\)')中不需要反斜杠
for language,proficiency in lp.split(' ')将尝试将lp.split(' ')的每个项解压到元组(language,proficiency)，因此，将lp.split(' ')包装到一个单元素列表中以实现您想要的：

# The outer loop comes first (one entry per iteration); wrapping the split
# result in a one-element list lets the pair be unpacked in the inner loop.
dict((language, proficiency.strip('()')) for lp in text.split(', ') for language, proficiency in [lp.split(' ')])

{'english': 'fluently', 'spanish': 'poorly'}

以上内容也可以写成字典推导式：

# Same pairing as a dict comprehension: key and value are unpacked from the
# one-element list holding each split entry.
{language: proficiency.strip('()') for lp in text.split(', ') for language, proficiency in [lp.split(' ')]}

读起来好一点

使用re的替代方法：

import re
# Each match captures a word and the parenthesised word that follows it;
# the two captured groups become one key/value pair.
dict(match.groups() for match in re.finditer(r'(\w+) \((\w+)\),?', text))

{'english': 'fluently', 'spanish': 'poorly'}

网友
3楼 · 编辑于 2024-05-16 02:57:49

虽然fferri为我的原始问题提供了一些完美的解决方案，但我在DataFrame场景下的最终解决方案更接近SuperNoob的建议

我的最终解决方案：

# Create a parser function to form a dictionary of language: proficiency pairs from the values in the 'Speaks' column.
def parse_dictionary(content):
    """Parse a string like ``'english, spanish (poorly)'`` into a dictionary.

    Args:
        content: Comma-separated entries of the form ``language`` or
            ``language (proficiency)``, or a missing value.

    Returns:
        ``None`` when ``pd.isna(content)`` is true. Otherwise a dict whose keys
        are the language names, stripped and capitalised, and whose values are
        the proficiency text from inside the parentheses, or ``pd.NA`` for an
        entry that has no parentheses.
    """
    if pd.isna(content):
        return None

    parsed = {}
    for entry in content.split(','):
        # partition() never raises, unlike unpacking split('('), which fails
        # when an entry contains more than one opening parenthesis.
        name, paren, rest = entry.partition('(')

        # Normalise the key the same way whether or not a proficiency is
        # given, so later lookups by language name behave consistently.
        name = name.strip().capitalize()
        if not name:
            # Skip empty entries such as those left by a trailing comma.
            continue

        parsed[name] = rest.strip().strip('()').strip() if paren else pd.NA
    return parsed
    
# Create a parser function to report whether a language is in the dictionary from the 'Speaks' column.
def parse_language(language, d):
    """Tell whether ``language`` is a key of the parsed dictionary ``d``.

    Returns ``None`` when ``pd.isna(d)`` is true, otherwise ``True`` or
    ``False``.
    """
    if pd.isna(d):
        return None
    return language in d
        
# Create a parser function to return the language proficiencies from the dictionary in the 'Speaks' column.
def parse_proficiency(language, d):
    """Look up the proficiency stored for ``language`` in the dictionary ``d``.

    Returns ``None`` when ``pd.isna(d)`` is true, the stored value when
    ``language`` is a key of ``d``, and ``pd.NA`` otherwise.
    """
    if pd.isna(d):
        return None
    return d.get(language, pd.NA)

# Replace each raw 'Speaks' value with the dictionary produced by parse_dictionary.
df['Speaks'] = df['Speaks'].map(parse_dictionary)

# Add one separate 'Language: <name>' column per language, filled by parse_language.
for language in languages:
    presence_column = 'Language: {}'.format(language)
    df[presence_column] = df['Speaks'].apply(lambda d: parse_language(language, d))

# Add one separate 'Language proficiency: <name>' column per language, filled by parse_proficiency.
for language in languages:
    proficiency_column = 'Language proficiency: {}'.format(language)
    df[proficiency_column] = df['Speaks'].apply(lambda d: parse_proficiency(language, d))

相关问题更多 >

编程相关推荐

热门问题

热门文章