如何高效地按子字符串（而非完全匹配）合并两张表

# Fetch Gene Ontology (GO) biological-process terms and UBERON anatomy terms
# from a SPARQL endpoint, then pair each GO term with the anatomy terms that
# occur inside it. Two strategies are shown: a slow regex scan that handles
# multi-word anatomy terms, and a fast word-level join that does not.
library(SPARQL)
library(parallel)
library(Hmisc)
library(tidyr)
library(dplyr)

my.endpoint <- "http://sparql.hegroup.org/sparql/"

# GO ids and labels restricted to the "biological_process" namespace.
go.query <- 'select * where { graph <http://purl.obolibrary.org/obo/merged/GO> { ?goid <http://www.geneontology.org/formats/oboInOwl#hasOBONamespace> "biological_process"^^<http://www.w3.org/2001/XMLSchema#string> . ?goid rdfs:label ?goterm}}'

go.result <- SPARQL(url = my.endpoint, query = go.query)
go.result.frame <- go.result[[1]]

# Distinct (label, id) pairs for UBERON entries that carry a database xref.
anat.query <- 'select distinct ?anatterm ?anatid where { graph <http://purl.obolibrary.org/obo/merged/UBERON> { ?anatid <http://www.geneontology.org/formats/oboInOwl#hasDbXref> ?xr . ?anatid rdfs:label ?anatterm}}'

anat.result <- SPARQL(url = my.endpoint, query = anat.query)
anat.result.frame <- anat.result[[1]]

# slow but recognizes multi-word substrings
#
# Iterate over (id, term) rows in id order rather than looking the term up by
# id: an id that has several labels would otherwise hand grepl() a pattern of
# length > 1 (only its first element is used) and be processed once per label.
# Rows with a missing id are skipped (na.last = NA).
anat.by.id <- anat.result.frame[order(anat.result.frame$anatid, na.last = NA), ]

loop.solution <- mclapply(
  X = seq_len(nrow(anat.by.id)),
  mc.cores = 7,
  FUN = function(row.index) {
    one.anat.id <- anat.by.id$anatid[row.index]
    one.anat.term <- anat.by.id$anatterm[row.index]

    # Whole-word match of the anatomy term anywhere inside each GO label.
    # NOTE(review): the term is interpolated into the regex unescaped, so a
    # label containing regex metacharacters may mis-match or raise an error.
    is.hit <- grepl(
      pattern = paste0('\\b', one.anat.term, '\\b'),
      x = go.result.frame$goterm
    )
    hits <- go.result.frame[is.hit, ]

    # NULL entries are ignored by the rbind() below.
    if (nrow(hits) == 0) {
      return(NULL)
    }

    hits$anatterm <- one.anat.term
    hits$anatid <- one.anat.id
    hits
  }
)

loop.solution <- do.call(rbind, loop.solution)

# from Brandon
# fast, but doesn't recognize multi-word matches
#
# Split each GO label into up to 26 single-word columns (named a..z), stack
# those words into long form, and join each word against the anatomy labels.
# NOTE(review): gather() is called with key/value names that already exist as
# columns and with no column selection; this appears to rely on gather()
# leaving the columns named by key/value ungathered -- confirm against the
# installed tidyr version.
sep.gather.soln <-
  separate(go.result.frame, goterm, letters, sep = " ", remove = FALSE) %>%
  gather(goid, goterm) %>%
  na.omit() %>%
  setNames(c("goid", "goterm", "code", "anatterm")) %>%
  select(goid, goterm, anatterm) %>%
  left_join(anat.result.frame) %>%
  na.omit()

2条回答

网友

1楼 · 编辑于 2024-05-13 23:25:52

我用的是您原帖中的数据。
先把每个词条拆分成单词，
再在字典中查找每个单词对应的类别，
三步合一：

# Phrases to classify, and a lookup from single words to food categories.
terms = ["cheese omelette", "turkey sandwich", "bean soup"]
dictionary = {'turkey': 'meat', 'cheese': 'dairy', 'sandwich': 'bread', 'beef': 'meat', 'omelette': 'eggs', 'bean': 'legume', 'carrot': 'vegetable', 'milk': 'dairy'}

# For every phrase, look up each of its words and emit "<phrase> <category>"
# once per distinct category. Words missing from the dictionary map to '' and
# are dropped by the trailing filter.
res = {
    term + ' ' + category
    for term in terms
    for category in {dictionary.get(word, '') for word in term.split()}
    if category
}

# res is a set, so the print order is not guaranteed.
for i in res:
    print(i)


output:
cheese omelette dairy
bean soup legume
turkey sandwich meat
turkey sandwich bread
cheese omelette eggs

网友

2楼 · 编辑于 2024-05-13 23:25:52

library(tidyr)
library(dplyr)

# Dishes served at each meal; a dish name is a space-separated list of words.
df1 <- data.frame(
  mealtime = c("breakfast", "lunch", "dinner", "dinner"),
  dish = c(
    "cheese omelette",
    "turkey sandwich",
    "bean soup",
    "something very long like this"
  )
)

# Lookup table mapping a single ingredient word to its food category.
df2 <- read.table(textConnection(
'ingredient  category
bean        legume
beef        meat
carrot      vegetable
cheese      dairy
milk        dairy
omelette    eggs
sandwich    bread
turkey      meat'), header = TRUE)

# Split every dish into its words. Splitting with strsplit() places no limit
# on the number of words per dish (splitting into the 26 `letters` columns
# would silently drop anything past the 26th word).
dish_words <- strsplit(as.character(df1$dish), " ", fixed = TRUE)
words_per_dish <- lengths(dish_words)

# One row per (dish, word), keeping the full dish name next to each word.
df1_long <- data.frame(
  mealtime = rep(df1$mealtime, times = words_per_dish),
  dish = rep(df1$dish, times = words_per_dish),
  ingredient = unlist(dish_words),
  stringsAsFactors = FALSE
)

# Order by word position (all first words, then all second words, ...);
# order() is stable, so dishes keep their original order within a position.
df1_long <- df1_long[order(sequence(words_per_dish)), ]

# Attach the category of each word, then drop words with no known category.
df1 <- na.omit(dplyr::left_join(df1_long, df2, by = "ingredient"))

df1

   mealtime            dish ingredient category
1 breakfast cheese omelette     cheese    dairy
2     lunch turkey sandwich     turkey     meat
3    dinner       bean soup       bean   legume
5 breakfast cheese omelette   omelette     eggs
6     lunch turkey sandwich   sandwich    bread

相关问题更多 >

编程相关推荐

热门问题

热门文章