使用__slots__减少Python对象trie的内存使用

# Module-level count of every TrieNode created; incremented in __init__.
NodeCount = 0


class TrieNode:
    """Trie node keyed by single characters (the question's original version).

    Each node keeps a regular per-instance ``__dict__`` (no ``__slots__``),
    and the node that terminates a word stores the full word string in
    ``word``.
    """

    # Class attribute shared by all nodes: maps each inserted word to its value.
    values = {}

    def __init__(self):
        global NodeCount
        self.word = None      # set to the full word on terminal nodes
        self.children = {}    # letter -> child TrieNode
        NodeCount += 1

    def insert(self, word, value):
        """Insert ``word`` below this node and record ``value`` for it.

        Args:
            word: string to insert, one trie level per character.
            value: stored in the shared ``TrieNode.values`` dict under
                ``word``; a later insert of the same word overwrites it.
        """
        node = self
        for letter in word:
            if letter not in node.children:
                node.children[letter] = TrieNode()
            node = node.children[letter]
        TrieNode.values[word] = value
        node.word = word

>>> class TrieNode:
...     NodeCount = 0
...     def __init__(self):
...         self.word = None
...         self.children = {}
...         #global NodeCount
...         TrieNode.NodeCount += 1
...
>>> tn = TrieNode()
>>> sys.getsizeof(tn) + sys.getsizeof(tn.__dict__)
176

import sys


class TrieNode:
    """Trie node using ``__slots__`` and integer code points as child keys.

    ``__slots__`` removes the per-instance ``__dict__``; children are keyed by
    ``ord(letter)`` instead of one-character strings.
    """

    # Shared among all instances, so there is only one such structure:
    # maps each inserted word to the list of value ids recorded for it.
    values = {}
    # Number of TrieNode instances created so far.
    NodeCount = 0
    __slots__ = "word", "children"

    def __init__(self):
        self.word = None      # set to the full word on terminal nodes
        self.children = {}    # code point (int) -> child TrieNode
        TrieNode.NodeCount += 1

    def insert(self, word, value=None):
        """Insert ``word`` below this node, optionally recording ``value``.

        Args:
            word: string to insert, one trie level per character.
            value: optional id (e.g. "XYZ999999999"). When not ``None`` it is
                converted with ``str()``, interned, and appended to the list
                kept for ``word`` in the shared ``TrieNode.values`` dict.
        """
        node = self
        for letter in word:
            codepoint = ord(letter)
            if codepoint not in node.children:
                node.children[codepoint] = TrieNode()
            node = node.children[codepoint]
        node.word = word
        if value is not None:
            # ``intern`` is only a builtin on Python 2; Python 3 exposes it as
            # ``sys.intern``. setdefault returns the list, so one lookup is enough.
            TrieNode.values.setdefault(word, []).append(sys.intern(str(value)))

1条回答

网友

1楼 · 发布于 2024-04-18 23:58:06

最容易实现的优化：在节点类中使用 `__slots__`，否则每个TrieNode对象都会携带一个dict：

class TrieNode:
    """Minimal trie node whose attributes live in slots, not a ``__dict__``."""

    # Declaring the attribute names up front suppresses the per-instance dict.
    __slots__ = ("word", "children")

    def __init__(self):
        # A fresh node has no children and carries no word.
        self.children = dict()
        self.word = None

现在，每个TrieNode对象将不再携带属性dict。比较大小：

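>>> class TrieNode:
...     def __init__(self):
...         self.word = None
...         self.children = {}
...
>>> tn = TrieNode()
>>> sys.getsizeof(tn) + sys.getsizeof(tn.__dict__)
176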

对比：

>>> class TrieNode:
...     __slots__ = "word", "children"
...     def __init__(self):
...         self.word = None
...         self.children = {}
...
>>> tn = TrieNode()
>>> sys.getsizeof(tn)
56
>>> tn.__dict__
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: 'TrieNode' object has no attribute '__dict__'

另一个优化：使用int对象作为键。小的int对象会被缓存，大多数字符的码点很可能都落在该范围内；即使不在，int在Python中虽然仍然不算轻量，也比单字符的字符串还要小：

>>> 'ñ'
'ñ'
>>> ord('ñ')
241
>>> sys.getsizeof('ñ')
74
>>> sys.getsizeof(ord('ñ'))
28

所以你可以做一些类似的事情：

def insert(self, word, value):
    """Add ``word`` to the trie rooted at ``self``, keyed by code points.

    Walks one level per character, creating missing child nodes on the way,
    and flags the final node as the end of a word. ``value`` is accepted but
    not used by this version.
    """
    current = self
    for code_point in map(ord, word):
        branches = current.children
        try:
            current = branches[code_point]
        except KeyError:
            # No child for this code point yet: create it and descend into it.
            current = branches[code_point] = TrieNode()
    # Only a boolean flag is stored (a reference to the True singleton),
    # not the word itself.
    current.is_word = True

此外，您还保留了一个类变量 `values` dict，它会增长得非常大，而其中的信息其实是多余的。你说：

I just add a dictionary to hold some value with quick access (I need it)

您可以从路径重建单词。它应该是比较快的，我会认真考虑不要有这个dict。检查一下只需容纳一百万个字符串就需要多少内存：

>>> d = {str(i):i for i in range(1000000)}
>>> (sum(sizeof(k)+sizeof(v) for k,v in d.items()) + sizeof(d)) * 1e-9
0.12483203000000001

你可以这样做：

class TrieNode:
    """Trie node that stores the word's value directly on its terminal node.

    Children are keyed by integer code points (``ord(letter)``) and
    ``__slots__`` removes the per-instance ``__dict__``. A node whose
    ``value`` is not ``None`` marks the end of an inserted word, so ``None``
    itself cannot be stored as a value.
    """

    __slots__ = "value", "children"

    def __init__(self):
        self.value = None     # None means "no word ends here"
        self.children = {}    # code point (int) -> child TrieNode

    def insert(self, word, value):
        """Insert ``word`` below this node and attach ``value`` to its last node.

        Args:
            word: string to insert, one trie level per character.
            value: payload for the word; also serves as the signal that the
                node terminates a word.
        """
        node = self
        for letter in word:
            code_point = ord(letter)
            if code_point not in node.children:
                node.children[code_point] = TrieNode()

            node = node.children[code_point]
        node.value = value  # this serves as a signal that it is a word

    def get(self, word, default=None):
        """Return the value stored for ``word``, or ``default`` if there is none.

        ``default`` is returned both when the path for ``word`` does not exist
        and when it exists but no value was stored on its final node.
        """
        val = self._get_value(word)
        return default if val is None else val

    def _get_value(self, word):
        """Follow ``word`` from this node and return the final node's value.

        Returns ``None`` as soon as a character has no matching child.
        """
        node = self
        for letter in word:
            code_point = ord(letter)
            try:
                node = node.children[code_point]
            except KeyError:
                return None
        return node.value

相关问题更多 >

编程相关推荐

热门问题

热门文章