Improving the performance of a Python puzzle
I was given a puzzle as part of an interview, and I learned yesterday that I was not selected (because I could not get the puzzle to perform well enough). I am wondering whether anyone can help me with this puzzle so that I can do better next time. It is written in Python; although I took a couple of Python courses a year or two ago, I am still fairly new to it compared with the area I worked in before (18 years of embedded C). Any help or advice is appreciated so that I can learn from it. The puzzle was submitted for interview consideration.
The puzzle challenge is described as follows:

Consider a "word" to be any sequence of capital letters A-Z (not limited to "dictionary words"). For any word with at least two different letters, there are other words composed of the same letters in a different order (for instance, STATIONARILY and ANTIROYALIST are both dictionary words; for our purposes, "AAIILNORSTTY" is also a "word" composed of the letters of these two). We can then assign a number to every word, based on where it falls in an alphabetically sorted list of all words made up of the same letters. One way to do this would be to generate the entire list of words and find the desired one, but this would be slow if the word is long. Write a program which takes a word as a command line argument and prints its number to standard output. Do not use the method above of generating the entire list. Your program should be able to accept any word of 20 letters or less (possibly with some letters repeated), and should use no more than 1 GB of memory and take no more than 500 milliseconds to run. Any answer we check will fit in a 64-bit integer.

Sample words, with their rank:

ABAB = 2
AAAB = 1
BAAA = 4
QUESTION = 24572
BOOKKEEPER = 10743
NONINTUITIVENESS = 8222334634

Your program will be judged on how fast it runs and on how clearly the code is written. We will be running your program as well as reading the source code, so anything you can do to make this process easier would be appreciated.
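For reference, the "generate the entire list" approach that the statement rules out still makes a handy correctness check for short inputs. A minimal sketch (my own, using only the standard library), not a submission:

from itertools import permutations

def rank_by_enumeration(word):
    # Brute force: sort every distinct rearrangement and locate the word.
    # Only practical for short words; the puzzle asks for something much faster.
    variants = sorted(set(permutations(word)))
    return variants.index(tuple(word)) + 1  # ranks are 1-based

print(rank_by_enumeration("ABAB"))  # 2
print(rank_by_enumeration("BAAA"))  # 4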
How to run the puzzle: you can either enter a word on the command line (the current state of the code), or, if you want to read the words listed above from a file, comment out the raw_input code that accepts a single word and uncomment the code that reads the words.txt file.

In the main part of the program:

Taking words one at a time from the command line (the current state of the code -- it will accept your word as command line input):

getInputFromCommandLine()

-- run it like this: python athenaPuzzleIterDeep.py

If you want to read the input from the words.txt file instead, uncomment the following (words.txt will be provided along with the code):

wordList = loadWords()
wordNumberOrdering(wordList)

-- run it like this: python athenaPuzzleIterDeep.py
-- but you must also make sure that words.txt is in the same directory as the Python program
Performance improvements I tried that were not good enough:

Iterative deepening: I tried iterative deepening to get DFS's space advantage together with BFS's time and shallow-solution advantages. The idea is to run DFS with a depth limit: try a tree depth of 1, then 2, 3, and so on. Rather than building the entire graph, at each tree level DFS is called to see whether a solution has been found. DFS searches down the left-hand children of the tree first but eventually visits every node, so it takes a long time while not using much space. If you borrow BFS's level-limiting idea, building the tree only level by level and then searching it with DFS, that is iterative deepening. A generic sketch of the technique is shown below.

Iterative deepening did not provide the needed performance improvement. I also tried to bring in a priority-queue Python library, but I could not get it to install correctly on my version of Linux.
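For reference, iterative deepening in its textbook form looks roughly like the following. This is a minimal illustration of the technique only, not the puzzle code; children_of stands in for whatever successor function the graph provides:

def depth_limited_dfs(node, goal, children_of, limit):
    # Depth-first search that refuses to descend more than `limit` levels.
    if node == goal:
        return node
    if limit == 0:
        return None
    for child in children_of(node):
        found = depth_limited_dfs(child, goal, children_of, limit - 1)
        if found is not None:
            return found
    return None

def iterative_deepening(start, goal, children_of, max_depth=20):
    # Re-run depth-limited DFS with limit 0, 1, 2, ... until the goal turns up.
    for limit in range(max_depth + 1):
        found = depth_limited_dfs(start, goal, children_of, limit)
        if found is not None:
            return found
    return None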
Contents of the words.txt file:
ABAB
AAAB
BAAA
QUESTION
ABCDEFGHIJKLMNOPQRSTUVWXYZ
BOOKKEEPER
BOOKKEEPERS
STATIONARILY
NONINTUITIVENESS
Here is the code:
import random
import string
from math import factorial
import itertools
from functools import update_wrapper
import time
import sys
sys.setrecursionlimit(5000)
# works for functions with hashable (immutable) arguments
# Example usage: permutations = memoize(itertools.permutations)
ALPHABET_LETTERS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

globalMemCache = {}

def memoize(f):
    # define "wrapper" function that checks cache for
    # previously computed answer, only calling f if this
    # is a new problem.
    def memf(*x):
        permutationsInCache = []
        if x not in memf.cache:
            memf.cache[x] = f(*x)
        return memf.cache[x]
    # initialize wrapper function's cache. store cache as
    # attribute of function so we can look at its value.
    memf.cache = globalMemCache
    return memf
def isValidWord(word):
    lenWord = len(word)
    if (lenWord > 20):
        print "word > 20 letters is NOT acceptable as input"
        print " "
        return False
    elif (lenWord >= 11):
        print "word >= 11 letters is NOT acceptable as input for this current iterative deepening solution"
        print "my iterative deepening solution takes too much time and space for words >= 11 letters"
        print " "
        return False
    wordInAlphabet = True
    for letter in word:
        if (wordInAlphabet != True) or (letter not in ALPHABET_LETTERS):
            wordInAlphabet = False
    return wordInAlphabet

permutationsMemoized = memoize(itertools.permutations)

WORDLIST_FILENAME = "words.txt"

def loadWords():
    print "Loading word list from file..."
    inFile = open(WORDLIST_FILENAME, 'r', 0)
    wordList = []
    for line in inFile:
        wordList.append(line.strip().lower())
    print " ", len(wordList), "words loaded."
    return wordList

def remove_duplicates(l):
    return list(set(l))

def printPath(path):
    result = ''
    for i in range(len(path)):
        if i == len(path) - 1:
            result = result + str(path[i])
        else:
            result = result + str(path[i]) + '->'
    return result
class Node(object):
    def __init__(self, name, index):
        self.name = str(name)
        self.index = index
    def getName(self):
        return self.name
    def getIndex(self):
        return self.index
    def __str__(self):
        return self.name

class Edge(object):
    def __init__(self, src, dest):
        self.src = src
        self.dest = dest
    def getSource(self):
        return self.src
    def getDestination(self):
        return self.dest
    def __str__(self):
        return str(self.src) + '->' + str(self.dest)

class Queue:
    def __init__(self):
        self.list = []
    def push(self, item):
        self.list.insert(0, item)
    def pop(self):
        return self.list.pop()
    def isEmpty(self):
        return len(self.list) == 0
def DFSShortest(graph, start, end, path = [], shortest = None, index = 1000):
    newGraph = graph
    path = path + [start]
    if str(start) == str(end):
        index = start.index
        newPath = path
        return newPath, index
    else:
        anyChildren = graph.childrenOf(start)
        if (anyChildren != None) and (index == 1000):
            for node in graph.childrenOf(start):
                if node not in path:  # avoid cycles
                    if (shortest == None) or ((shortest != None) and (len(path) < len(shortest))):
                        newPath, index = DFSShortest(newGraph, node, end, path)
                        if newPath != None:
                            shortest = newPath
                            if (index != 1000):
                                return shortest, index
        elif (anyChildren == None) and (index == 1000):
            newPath, index = DFSShortest(newGraph, graph.parents[start], end, path)
            if newPath != None:
                shortest = newPath
                if (index != 1000):
                    return shortest, index
    return shortest, index

def BFS(graph, start, end, q):
    initPath = [start]
    q.append(initPath)
    while len(q) != 0:
        tmpPath = q.pop(0)
        lastNode = tmpPath[len(tmpPath) - 1]
        if str(lastNode) == str(end):
            return lastNode.index
        if (graph.childrenOf(lastNode) != []):
            printPath(graph.childrenOf(lastNode))
            for linkNode in graph.childrenOf(lastNode):
                if linkNode not in tmpPath:
                    newPath = tmpPath + [linkNode]
                    q.append(newPath)
    return None
class Digraph(object):
    def __init__(self):
        self.nodes = set([])
        self.edges = {}
        self.parents = {}
    def addNode(self, node):
        if node in self.nodes:
            raise ValueError('Duplicate node')
        else:
            self.nodes.add(node)
            self.edges[node] = []
            #print "added edges = [] for node " + str(node)
    def addEdge(self, edge):
        src = edge.getSource()
        dest = edge.getDestination()
        self.edges[src].append(dest)
        self.parents[dest] = src
    def childrenOf(self, node):
        if (self.edges[node]):
            return self.edges[node]
        else:
            return None
    def hasNode(self, node):
        return node in self.nodes
    def __str__(self):
        res = ''
        for k in self.edges:
            for d in self.edges[k]:
                res = res + str(k) + '->' + str(d) + '\n'
        return res[:-1]

class Graph(Digraph):
    def addEdge(self, edge):
        Digraph.addEdge(self, edge)

def addEdgesForTreesWith4Nodes(g, childNode, factorNum, i, lenList, wordNodes):
    if (i + factorNum + 1) < lenList:
        g.addEdge(Edge(wordNodes[childNode + 1], wordNodes[i + factorNum + 1]))
    if (i + factorNum + 2) < lenList:
        g.addEdge(Edge(wordNodes[childNode + 1], wordNodes[i + factorNum + 2]))

def addEdgesForTreesWithMoreThan4Nodes(g, childNode, factorNum, i, lenList, wordNodes, numChildrenNodesThisLevel, numChildrenNodesPreviousLevel):
    if (i + factorNum + numChildrenNodesPreviousLevel) < lenList:
        g.addEdge(Edge(wordNodes[childNode + i], wordNodes[i + factorNum + numChildrenNodesPreviousLevel]))
    if (i + factorNum + numChildrenNodesThisLevel + 1) < lenList:
        g.addEdge(Edge(wordNodes[childNode + i], wordNodes[i + factorNum + numChildrenNodesPreviousLevel + 1]))
"""
Can try using iterative deepening to get the DFS space advantage with BFS's time and shallow
solution advantage. So can try running DFS with depth limits: try depth of tree = 1, then 2, 3,...etc
"""
"""
Also - you can avoid the log(n) overhead in DFS/BFS with a priority queue (had trouble downloading and installing one on my computer!)
"""
def iterativeDeepeningSolution(wordNodes, saveWord, saveWordTuple, lenList):
    #rather than building entire graph, at each tree level, call DFS to see if solution found
    #DFS will search going down left side of tree's child nodes first, but will eventually search
    #every node, so takes too much time while not taking much space. However, if you use the level
    #limitation idea from BFS, only building the tree level by level and then searching it with DFS,
    #that is the idea of iterative deepening.
    index = 0
    q = []
    shortest = None
    saveNodes = wordNodes
    i = 0
    totalNodes = 1
    numChildrenNodesPreviousLevel = 0
    while i < lenList:
        index = 0
        if (i > 0):
            numChildrenNodesPreviousLevel = numChildrenNodesThisLevel
        numChildrenNodesThisLevel = 2**i  #number of children nodes at level
        if (i > 0):
            totalNodes += numChildrenNodesThisLevel
        if (numChildrenNodesThisLevel > 1) and (numChildrenNodesThisLevel <= 32):  #only search 32 children nodes or less (level 5 of tree, 2**5 = 32):
            #print "build graph - if previous level already searched - just add this level of children nodes"
            if (numChildrenNodesThisLevel == 2):  #new graph since none built when it was just a root node
                g = Graph()
                for n in range(numChildrenNodesThisLevel + 1):
                    g.addNode(wordNodes[n])
            else:  #use graph from last level of children added - don't rebuild graph
                n = numChildrenNodesThisLevel - 1
                while (n < lenList) and (n < (totalNodes)):
                    g.addNode(wordNodes[n])
                    n += 1
        elif (numChildrenNodesThisLevel > 32):  #only search 32 children nodes or less (level 5 of tree, 2**5 = 32)
            print "word graph just searched: " + str(saveWord)
            print "cannot go further searching in iterative deepening - tree will take too much space and time to search"
            print "Tree Level = " + str(i) + " num children at this level " + str(numChildrenNodesThisLevel) + " total nodes in graph " + str(totalNodes)
            print "Last Level Searched " + str(i - 1) + " num children at this level " + str(numChildrenNodesPreviousLevel) + " total nodes in graph " + str(totalNodes - numChildrenNodesThisLevel)
            print " "
            return
        if (numChildrenNodesThisLevel > 2):
            childNode = 0
            while childNode < numChildrenNodesPreviousLevel:
                if (childNode > 0):
                    factorNum = childNode * 2
                else:
                    factorNum = childNode
                if (numChildrenNodesThisLevel == 4):
                    addEdgesForTreesWith4Nodes(g, childNode, factorNum, i, lenList, wordNodes)
                elif (numChildrenNodesThisLevel > 4):
                    addEdgesForTreesWithMoreThan4Nodes(g, childNode, factorNum, i, lenList, wordNodes, numChildrenNodesThisLevel, numChildrenNodesPreviousLevel)
                childNode += 1
            startNode = wordNodes[0]
            endNode = Node(str(saveWordTuple), 0)
            index = 1000
            path, index = DFSShortest(g, startNode, endNode, q, shortest, index)
            if (index != 1000):  #made up error - not searching 1000 nodes or more at this time - soln found
                print saveWord + " = " + str(index + 1)
                print " "
                return
            i += 1
            wordNodes = saveNodes
        elif (numChildrenNodesThisLevel == 2):  #so new graph just formed of 3 nodes (including root) - no edges on it yet
            g.addEdge(Edge(wordNodes[0], wordNodes[1]))
            g.addEdge(Edge(wordNodes[0], wordNodes[2]))
            startNode = wordNodes[0]
            endNode = Node(str(saveWordTuple), 0)
            index = 1000
            path, index = DFSShortest(g, startNode, endNode, q, shortest, index)
            if (index != 1000):  #made up error - not searching 1000 nodes or more at this time - soln found
                print saveWord + " = " + str(index + 1)
                print " "
                return
            i += 1
            wordNodes = saveNodes
        elif (numChildrenNodesThisLevel == 1):
            startNode = wordNodes[0]
            oneNode = Node(str(saveWordTuple), 0)
            if str(oneNode) == str(startNode):
                print saveWord + " = " + str(startNode.index + 1)
                print " "
                return
            else:
                i += 1
                wordNodes = saveNodes
def wordNumberOrdering(wordList):
    for word in wordList:
        permutationTuples = []
        withDupsList = []
        noDupsList = []
        noDupsStringList = []
        index = 0
        outputDict = {}
        saveWord = ""
        saveWordTuple = []
        wordLen = len(word)
        if (wordLen <= 10):
            saveWord = word
            saveWordTuple = tuple(saveWord,)
            permutationTuples = permutationsMemoized(word)
            for tupleStr in permutationTuples:
                withDupsList.append(tupleStr)
            noDupsList = remove_duplicates(withDupsList)
            lenList = len(noDupsList)
            noDupsList.sort()
            wordNodes = []
            i = 0
            for name in noDupsList:
                wordNodes.append(Node(str(name), i))
                i += 1  #index of list to print when found for this puzzle
            iterativeDeepeningSolution(wordNodes, saveWord, saveWordTuple, lenList)
        elif (wordLen > 20):
            print word
            print "word length too long (> 20 chars): " + str(wordLen)
            print " "
        elif (wordLen >= 11):
            print word
            print "word length too long for this current solution to puzzle (>= 11 chars): " + str(wordLen)
            print " "
def oneWordInputFromCommandLineAtATime(word):
    permutationTuples = []
    withDupsList = []
    noDupsList = []
    noDupsStringList = []
    index = 0
    outputDict = {}
    saveWord = ""
    saveWordTuple = []
    saveWord = word
    saveWordTuple = tuple(saveWord,)
    permutationTuples = permutationsMemoized(word)
    for tupleStr in permutationTuples:
        withDupsList.append(tupleStr)
    noDupsList = remove_duplicates(withDupsList)
    lenList = len(noDupsList)
    noDupsList.sort()
    wordNodes = []
    i = 0
    for name in noDupsList:
        wordNodes.append(Node(str(name), i))
        i += 1  #index of list to print when found for this puzzle
    iterativeDeepeningSolution(wordNodes, saveWord, saveWordTuple, lenList)
def getInputFromCommandLine():
    guessWord = ""
    guessWordLowCase = ""
    validWord = False
    takeInput = True
    while (takeInput == True):
        guessWord = raw_input('Enter word, or a "." to indicate that you are finished: ').decode('utf-8')
        guessWordLowCase = guessWord.lower()
        print "word being considered " + guessWordLowCase
        if (guessWordLowCase == "."):
            takeInput = False
        else:  #otherwise consider this word as an input from command line
            validWord = isValidWord(guessWordLowCase)
            if (validWord == False):
                print guessWordLowCase + " is INVALID"
                print "Invalid word, please try again"
                print " "
            else:
                oneWordInputFromCommandLineAtATime(guessWordLowCase)
    print "Goodbye!"

if __name__ == '__main__':
    #taking input word by word from command line
    getInputFromCommandLine()

    #uncomment the following if you want to take the input from words.txt, a file of words to read in instead
    #wordList = loadWords()
    #wordNumberOrdering(wordList)
3 Answers
I heard about this problem yesterday, from somewhere unrelated to this forum. Below is a simple solution in C. There is some Win32 stuff in it, but it should be easy to convert to Python (or anything else). It relies on the fact that the number of words made from repeated letters can be counted with the multinomial coefficient, (SUM(mi))! / PRODUCT(mi!). It uses lowercase letters (a-z), it is quick and dirty, and there is no error control whatsoever. Here is the program (multinom.exe) in action, along with a second program, multinom2.exe, that solves the problem in reverse: given the letters and an index, it finds the string. The code follows.
multinom.exe question
index = 24572
multinom2.exe noitseuq 24572
string = question
#include <Windows.h>
#include <stdio.h>

ULONGLONG fact( ULONGLONG n )
{
    if ( n == 0 )
        return 1;
    return n * fact(n-1);
}

ULONGLONG multinom( INT mults[] )  // = (SUM(M))! / PROD(M!)
{
    ULONGLONG n = 0;
    for ( INT i=0; i<26; i++ )
        n += mults[i];
    ULONGLONG result = fact(n);
    for ( INT i=0; i<26; i++ )
        if ( mults[i] )
            result /= fact(mults[i]);
    return result;
}

// uses a~z as alphabet; strings up to 20 chars; no safeguards or E/C whatsoever.
INT main ( INT argc, LPSTR* argv )
{
    ULONGLONG index = 1;  // we'll add to this any earlier strings
    CHAR str[21];
    lstrcpy(str, argv[1]);
    INT mults[26] = {0};  // initialize multiplicities to zero
    for ( CHAR *p=str; *p != 0; p++ )  // set multiplicities that are non-zero
    {
        mults[*p - 'a'] += 1;
    }
    for ( CHAR *p = str; *p != 0; p++ )  // iterate through the characters of str
    {
        for ( INT i=0; i < (*p - 'a'); i++ )  // check each character lexicographically before *p
        {
            if ( mults[i] )  // it's in the string; count (as earlier in the list) the strings that start with it
            {
                mults[i] -= 1;
                index += multinom(mults);
                mults[i] += 1;
            }
        }
        // At this point we've counted all the words that start with an earlier character.
        // Any remaining earlier words must match up to this point. So ...
        mults[*p - 'a'] -= 1;  // p will be incremented so, in effect, forget this character and move on
    }
    printf("index = %I64u\n", index);
    return 0;
}
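Since the question asks for Python, a rough, untested translation of the same counting idea might look like this (the names multinomial and word_index are just placeholders of mine; like the C version, it assumes a lowercase a-z input):

from math import factorial

def multinomial(counts):
    # (SUM(m_i))! / PRODUCT(m_i!) -- number of distinct arrangements of the letters
    result = factorial(sum(counts))
    for m in counts:
        result //= factorial(m)
    return result

def word_index(word):
    counts = [0] * 26  # letter multiplicities for 'a'..'z'
    for ch in word:
        counts[ord(ch) - ord('a')] += 1
    index = 1  # ranks are 1-based
    for ch in word:
        c = ord(ch) - ord('a')
        # count every word that starts, at this position, with a strictly smaller letter
        for smaller in range(c):
            if counts[smaller]:
                counts[smaller] -= 1
                index += multinomial(counts)
                counts[smaller] += 1
        # any remaining earlier words must match this character, so consume it and move on
        counts[c] -= 1
    return index

print(word_index("question"))  # expected: 24572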
Here is a non-recursive version, inspired by DSM's solution (the overall idea is entirely his):
from collections import Counter
from math import factorial

def number_of_distinct_permutations(counted):
    result = factorial(sum(counted))
    for each in counted:
        result //= factorial(each)
    return result

def anagram_number(iterable):
    elems = list(iterable)
    tally = Counter()
    index = 1
    while elems:
        current = elems.pop()
        tally[current] += 1
        for item in tally:
            if item < current:
                tally[item] -= 1
                index += number_of_distinct_permutations(tally.values())
                tally[item] += 1
    return index
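To try it against the sample words from the problem statement (a quick driver I added; the function compares the characters directly, so it works on uppercase or lowercase alike as long as the input is consistent):

for word in ["AAAB", "ABAB", "BAAA", "QUESTION", "BOOKKEEPER", "NONINTUITIVENESS"]:
    print(word, anagram_number(word))
# Should print the reference ranks: 1, 2, 4, 24572, 10743, 8222334634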
I think the key here is to think about what the rank of the different permutations means. For example, given a string like BAAA, we know its rank must be greater than that of every permutation starting with A. So if we can count how many permutations start with A, we do not need to enumerate them one by one. And how many are there? Exactly the number of distinct permutations that begin with A. That is easy to compute, and then we have to work out where BAAA falls among the permutations that start with B -- which is the same as working out where AAA falls, counting from the first permutation that starts with B (and we know where that one sits: immediately after all of the permutations that start with A).

Something along these lines should work; it simply generalizes that idea. (Disclaimer: this is untested -- there may be corner cases I have forgotten, and so on -- but I am fairly confident in the basic approach.)
from math import factorial
from collections import Counter

def number_of_distinct_permutations(counts):
    f = factorial(sum(counts.values()))
    for letter, count in counts.items():
        f //= factorial(count)
    return f

def compute_index(word, index=0):
    if not word:
        return index + 1
    pending = Counter(word)
    head = word[0]
    for p in sorted(pending):
        if p < head:
            index += number_of_distinct_permutations(pending - Counter(p))
        if p == head:
            index += compute_index(word[1:])
    return index

test_data = {"ABAB": 2,
             "AAAB": 1,
             "BAAA": 4,
             "QUESTION": 24572,
             "BOOKKEEPER": 10743,
             "NONINTUITIVENESS": 8222334634}

print("word, reference, calculated")
for k, v in sorted(test_data.items()):
    print (k, v, compute_index(k))
This produces:
word, reference, calculated
AAAB 1 1
ABAB 2 2
BAAA 4 4
BOOKKEEPER 10743 10743
NONINTUITIVENESS 8222334634 8222334634
QUESTION 24572 24572