匹配词，与大小写无关

> df

Id       Clean_Data
1918916  Luxury Apartments consisting 11 towers Well equipped gymnasium Swimming Pool Toddler Pool Health Club Steam Room Sauna Jacuzzi Pool Table Chess Billiards room Carom Table Tennis indoor games
1495638  near medavakkam junction calm area near global hospital
1050651  No Pre Emi No Booking Amount No Floor Rise Charges No Processing Fee HLPROJECT HIGHLIGHTS

# Snippet from the question: it expects `df`, `pd`, `ngrams`, `word_tokenize`,
# `partial`, `Categories` and `HealthCare` to be in scope already.

# Per-row token list (whitespace split) and 2-/3-/4-word n-gram lists.
df['one_word_tokenized_text'] =df["Clean_Data"].str.split()
df['bigram'] = df['Clean_Data'].apply(lambda row: list(ngrams(word_tokenize(row), 2)))
df['trigram'] = df['Clean_Data'].apply(lambda row: list(ngrams(word_tokenize(row), 3)))
df['four_words'] = df['Clean_Data'].apply(lambda row: list(ngrams(word_tokenize(row), 4)))
token=pd.Series(df["one_word_tokenized_text"])
Lid=pd.Series(df["Id"])
# Run one regex (a capture group per category phrase) over every single token.
matches= token.apply(lambda x: pd.Series(x).str.extractall("|".join(["({})".format(cat) for cat in Categories.HealthCare])))
# Flatten each row's match table and keep only the string cells.
match_list= [[m for m in match.values.ravel() if isinstance(m, str)] for match in matches]
match_df = pd.DataFrame({"ID":Lid,"jc1":match_list})


def match_word(feature, row):
    """Return the space-joined n-grams of `row` that are contained in `feature`.

    `row` must expose `bigram`, `trigram` and `four_words` iterables of word
    tuples; results are collected in that order.
    """
    categories = []
    for bigram in row.bigram:
        joined = ' '.join(bigram)
        if joined in feature:
            categories.append(joined)
    for trigram in row.trigram:
        joined = ' '.join(trigram)
        if joined in feature:
            categories.append(joined)
    for fourwords in row.four_words:
        joined = ' '.join(fourwords)
        if joined in feature:
            categories.append(joined)
    return categories


# Multi-word matches per row, then both match columns joined into one string.
match_df['Health1'] = df.apply(partial(match_word, HealthCare), axis=1)
match_df['HealthCare'] = match_df[match_df.columns[[1,2]]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)

# (phrase, relation, category) triples used to build the lookup lists.
category = [
    ('steam room', 'IN', 'HealthCare'),
    ('sauna', 'IN', 'HealthCare'),
    ('Jacuzzi', 'IN', 'HealthCare'),
    ('Aerobics', 'IN', 'HealthCare'),
    ('yoga room', 'IN', 'HealthCare'),
]
# Phrases whose third field is 'HealthCare'.
HealthCare = [e1 for (e1, rel, e2) in category if e2 == 'HealthCare']

1条回答

网友

1楼 · 发布于 2024-06-16 11:48:56

编辑2：仅 Categories.py 已更新

Categories.py

# Seed (phrase, relation, category) triples.
category = [
    ('steam room', 'IN', 'HealthCare'),
    ('sauna', 'IN', 'HealthCare'),
    ('jacuzzi', 'IN', 'HealthCare'),
    ('aerobics', 'IN', 'HealthCare'),
    ('Yoga room', 'IN', 'HealthCare'),
    ('booking', 'IN', 'HealthCare'),
]

# Two spellings of every seed phrase: str.capitalize() and str.lower().
category1 = [entry[0].capitalize() for entry in category]
category2 = [entry[0].lower() for entry in category]

# Re-attach the relation and category fields of the matching seed triple.
test = [(phrase, seed[1], seed[2]) for phrase, seed in zip(category1, category)]
test2 = [(phrase, seed[1], seed[2]) for phrase, seed in zip(category2, category)]

# Final table: the seeds followed by the capitalized and lower-cased variants.
category = category + test + test2

# Every phrase filed under 'HealthCare', in table order.
HealthCare = [entry[0] for entry in category if entry[2] == 'HealthCare']

您未更改的数据集

import pandas as pd
from nltk import ngrams, word_tokenize
import Categories
from Categories import *
from functools import partial


# Sample listings as (Id, cleaned description) pairs.
_samples = [
    (1918916, 'Luxury Apartments consisting 11 towers Well equipped gymnasium Swimming Pool Toddler Pool Health Club Steam Room Sauna Jacuzzi Pool Table Chess Billiards room Carom Table Tennis indoor games'),
    (1495638, 'near medavakkam junction calm area near global hospital'),
    (1050651, 'No Pre Emi No Booking Amount No Floor Rise Charges No Processing Fee HLPROJECT HIGHLIGHTS '),
]

# Column-oriented view of the samples, in the column order the frame uses.
data = {
    'Clean_Data': [text for _, text in _samples],
    'Id': [listing_id for listing_id, _ in _samples],
}

df = pd.DataFrame(data)


# Per-row token list: the description split on whitespace.
df['one_word_tokenized_text'] =df["Clean_Data"].str.split()
# Per-row lists of 2-, 3- and 4-word n-grams over the word_tokenize() output.
df['bigram'] = df['Clean_Data'].apply(lambda row: list(ngrams(word_tokenize(row), 2)))
# Fixed: the original line had a stray ')' after df['Clean_Data'] (SyntaxError).
df['trigram'] = df['Clean_Data'].apply(lambda row: list(ngrams(word_tokenize(row), 3)))
df['four_words'] = df['Clean_Data'].apply(lambda row: list(ngrams(word_tokenize(row), 4)))
token=pd.Series(df["one_word_tokenized_text"])
Lid=pd.Series(df["Id"])
# For each row, run one regex (a capture group per HealthCare phrase) against
# every individual token. Tokens hold no whitespace, so only single-word
# phrases can match here; multi-word phrases are handled via the n-gram columns.
matches= token.apply(lambda x: pd.Series(x).str.extractall("|".join(["({})".format(cat) for cat in Categories.HealthCare])))
# Flatten each row's match table and keep only string cells (this drops the
# non-string placeholders of capture groups that did not take part in a match).
match_list= [[m for m in match.values.ravel() if isinstance(m, str)] for match in matches]
match_df = pd.DataFrame({"ID":Lid,"jc1":match_list})


def match_word(feature, row):
    """Collect the n-gram phrases of `row` that are contained in `feature`.

    Args:
        feature: Container tested with `in` for each candidate phrase.
        row: Object exposing `bigram`, `trigram` and `four_words`, each an
            iterable of word tuples.

    Returns:
        The space-joined n-grams found in `feature`, bigrams first, then
        trigrams, then four-word grams, each group in its original order.
    """
    ngram_groups = (row.bigram, row.trigram, row.four_words)
    phrases = (' '.join(gram) for group in ngram_groups for gram in group)
    return [phrase for phrase in phrases if phrase in feature]

# Multi-word category phrases found in each row's n-gram columns.
match_df['Health1'] = df.apply(partial(match_word, HealthCare), axis=1)
# Join the 2nd and 3rd columns of match_df (positions 1 and 2) into one
# comma-separated string per row. Fixed: the original line ended with pasted
# residue ("enize(row), 4)))") that made it a SyntaxError.
match_df['HealthCare'] = match_df[match_df.columns[[1,2]]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)

输出

# Call form of print: valid on both Python 2 and Python 3.
print(match_df)

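+--------+----------------+-------------+------------------------------------+
|ID      |jc1             |Health1      |HealthCare                          |
+--------+----------------+-------------+------------------------------------+
|1918916 |[sauna, jacuzzi]|             |['sauna', 'jacuzzi'],['steam room'] |
+--------+----------------+-------------+------------------------------------+
|1495638 |                |             |                                    |
+--------+----------------+-------------+------------------------------------+
|1050651 |    [Booking]   |             |  ['Booking'],[]                    |
+--------+----------------+-------------+------------------------------------+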

Categories.py

您未更改的数据集

输出

相关问题更多 >

编程相关推荐

热门问题

热门文章