如何在pyspark操作中轻松使用自定义类方法？

import re


class Age:
    """An age that can be compared against strings containing its digits.

    The right-hand operand of every comparison is a string; all digit
    characters in it are concatenated and converted to an int before the
    comparison with ``self.age`` takes place.
    """

    def __init__(self, age):
        # age is a number representing the age of a person
        self.age = age

    def __eq__(self, other):
        return self.age == self.__parse(other)

    def __lt__(self, other):
        return self.age < self.__parse(other)

    def __gt__(self, other):
        return self.age > self.__parse(other)

    def __le__(self, other):
        return self.age <= self.__parse(other)

    def __ge__(self, other):
        return self.age >= self.__parse(other)

    def __parse(self, age):
        """Return the int formed by joining every digit found in ``age``.

        Raises:
            ValueError: if ``age`` contains no digit at all (``int('')``).
        """
        return int(''.join(re.findall(r'\d', age)))


# Let's test this class
if __name__ == '__main__':
    print(Age(18) == 'noise18noise')
    print(Age(18) <= 'aka 1 fakj 8 jal')
    print(Age(18) >= 'jaa 18 ka')
    print(Age(18) < '1 kda 9')
    print(Age(18) > 'akfa 1 na 7 noise')

# Output:
# True
# True
# True
# True
# True

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/opt/spark-2.3.1-bin-hadoop2.7/python/pyspark/sql/column.py", line 116, in _
    njc = getattr(self._jc, name)(jc)
  File "/opt/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1248, in __call__
  File "/opt/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1218, in _build_args
  File "/opt/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1218, in <listcomp>
  File "/opt/spark-2.3.1-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 298, in get_command_part
AttributeError: 'Age' object has no attribute '_get_object_id'

>>> import pyspark.sql.functions as F
>>> import pyspark.sql.types as T
>>> eq20 = F.udf(lambda c: c == Age(20), T.BooleanType())
>>> ages.filter(eq20(ages.Age)).show()
+-----+------------+
| Name|         Age|
+-----+------------+
|alpha|noise20noise|
+-----+------------+

# other imports here
...
import pyspark.sql.functions as F
import pyspark.sql.types as T


def connect_to_pyspark(function):
    # NOTE: this attempt returns the UDF object itself, so the decorated
    # method is replaced by a UDF rather than by a regular method.
    return F.udf(function, T.BooleanType())


class Age(str):
    ...

    @connect_to_pyspark
    def __eq__(self, other):
        return self.age == self.__parse(other)

    ...
    # do the same decorator for the other comparative methods

1条回答

网友

1楼 · 发布于 2024-06-16 11:36:39

要让 ages.Age == Age(20) 生效会相当困难，因为 Spark 不遵循 Python 实现 __eq__ 的约定。稍后会详细说明这一点；但如果您可以接受写成 Age(20) == ages.Age，那么就有一些选择。依我看，最简单的方法是把解析逻辑直接包装在一个 udf 中：

# Sketch only: the first argument (elided as ...) stands for the Python
# function that parses a raw value; the UDF is declared to return an integer.
parse_udf = F.udf(..., T.IntegerType())
class Age:
    ...
    # NOTE(review): Column is presumably pyspark.sql.Column; it is not
    # imported anywhere in this snippet - confirm before running.
    def __eq__(self, other: Column):
        """Compare the literal ``self.age`` with ``parse_udf`` applied to ``other``."""
        return F.lit(self.age) == parse_udf(other)

注意，Age 不要继承 str，那样只会带来一大堆麻烦。如果您想使用装饰器，那么装饰器不应该返回 udf，而应该返回一个应用该 udf 的函数。像这样：

import re
import pyspark.sql.functions as F
import pyspark.sql.types as T

def connect_to_pyspark(function):
    """Wrap a two-argument comparison so it can be applied to a Spark column.

    Args:
        function: callable taking ``(age, item)``, where ``item`` is a single
            value taken from the column.

    Returns:
        A callable ``(age, other)`` that builds a boolean UDF around
        ``function`` with ``age`` bound, and returns that UDF applied to
        ``other``.
    """
    def compare_with_column(age, other):
        # The UDF is created per call so that it closes over this ``age``.
        return F.udf(
            lambda value: function(age, value),
            T.BooleanType(),
        )(other)

    return compare_with_column

class Age:
    """An age whose ``==`` against a Spark column goes through a UDF.

    ``__eq__`` is wrapped by ``connect_to_pyspark``, so ``Age(n) == column``
    passes the column to that wrapper instead of returning a plain bool.
    """

    def __init__(self, age):
        # Store the caller's value; a hard-coded constant here would make
        # every instance compare against the same age.
        self.age = age

    def __parse(self, other):
        """Return the int formed by joining every digit found in ``other``.

        Raises:
            ValueError: if ``other`` contains no digit at all (``int('')``).
        """
        return int(''.join(re.findall(r'\d', other)))

    @connect_to_pyspark
    def __eq__(self, other):
        # Runs once per column value: ``other`` is a single raw string here.
        return self.age == self.__parse(other)

# Age(20) must be the left operand so that Age.__eq__ handles the comparison.
ages.withColumn("eq20", Age(20) == ages.Age).show()

下面进一步说明为什么需要写成 Age(20) == ages.Age。在 Python 中，如果您执行 a == b，而 a 的类不知道如何与 b 进行比较，那么 a.__eq__(b) 应该返回 NotImplemented，然后 Python 会尝试 b.__eq__(a)；但是 Spark 从不返回 NotImplemented，因此只有当 Age 位于表达式的左侧时，Age 的 __eq__ 才会被调用。

相关问题更多 >

编程相关推荐

热门问题

热门文章