<p>它没有经过任何测试，也不会赢得任何速度比赛：</p>
<pre><code>#' Find substrings that occur often across a vector of strings
#'
#' Collects every substring of at least `atleaststringsize` characters from
#' each element of `column`, counts how often each one occurs, keeps those
#' occurring at least `atleasttimes` times, and finally drops every pattern
#' that is itself contained in another kept pattern.
#'
#' @param column Vector of strings to scan; coerced with `as.character()`,
#'   missing values are ignored.
#' @param atleaststringsize Minimum substring length, in characters.
#' @param atleasttimes Minimum number of occurrences for a pattern to be kept.
#' @param uniqueInColumns If `TRUE`, a substring is counted at most once per
#'   element of `column`; if `FALSE`, every (overlapping) occurrence counts.
#' @return A list with `what` (the patterns, in sorted order) and `times`
#'   (their counts); both have length zero when nothing qualifies.
getOftenOccuringPatterns &lt;- function(column, atleaststringsize, atleasttimes,
                                     uniqueInColumns = FALSE) {
  # nchar(NA_character_) is NA and would break the ranges built below.
  column &lt;- as.character(column)
  column &lt;- column[!is.na(column)]

  substrings &lt;- lapply(column, function(x) {
    n &lt;- nchar(x)
    # Strings shorter than the minimum contribute nothing. Without this guard
    # `atleaststringsize:n` counts downwards and yields too-short substrings.
    if (n &lt; atleaststringsize) {
      return(NULL)
    }
    lapply(atleaststringsize:n, function(len) {
      parts &lt;- substring(x, seq_len(n - len + 1L), len:n)
      if (uniqueInColumns) unique(parts) else parts
    })
  })

  all_parts &lt;- unlist(substrings)
  if (length(all_parts) == 0L) {
    # rle() rejects NULL, so answer the "no candidates" case directly.
    return(list(what = character(0), times = integer(0)))
  }

  # Sorting puts equal substrings next to each other so rle() can count them.
  runs &lt;- rle(all_parts[order(all_parts)])
  frequent &lt;- runs$lengths &gt;= atleasttimes
  what &lt;- runs$values[frequent]
  times &lt;- runs$lengths[frequent]

  # A pattern always matches itself, so more than one hit means it is part of
  # another kept pattern. fixed = TRUE compares literally, so characters such
  # as "(" or "." in the data are not interpreted as regular expressions.
  is_contained &lt;- vapply(what, function(pattern) {
    length(grep(pattern, what, fixed = TRUE)) &gt; 1L
  }, logical(1), USE.NAMES = FALSE)

  list(what = what[!is_contained], times = times[!is_contained])
}
# Example usage; lines starting with "#&gt;" show the printed result.

# Every element contains "bla" and "okay" exactly once.
column &lt;- c("bla1okay", "okay1243bla", "blaokay", "bla12okay", "okaybla")
getOftenOccuringPatterns(column, atleaststringsize = 3, atleasttimes = 4)
#&gt; $what
#&gt; [1] "bla"  "okay"
#&gt; $times
#&gt; [1] 5 5

# Overlapping occurrences are all counted: "aaaaaa" occurs 3 + 2 + 1 = 6 times.
getOftenOccuringPatterns(c("aaaaaaaa", "aaaaaaa", "aaaaaa", "aaaaa", "aaaa", "aaa"), atleaststringsize = 3, atleasttimes = 4)
#&gt; $what
#&gt; [1] "aaaaaa"
#&gt; $times
#&gt; [1] 6

# With uniqueInColumns = TRUE a pattern counts once per element:
# "aaaaa" is found in 4 of the 6 elements.
getOftenOccuringPatterns(c("aaaaaaaa", "aaaaaaa", "aaaaaa", "aaaaa", "aaaa", "aaa"), atleaststringsize = 3, atleasttimes = 4, uniqueInColumns = TRUE)
#&gt; $what
#&gt; [1] "aaaaa"
#&gt; $times
#&gt; [1] 4
</code></pre>