擅长:python、mysql、java
<p>您必须只筛选出应该合并为<code>true</code>并聚合的行。然后将其与所有剩余行合并。你知道吗</p>
<pre><code>import pyspark.sql.functions as functions
df = sqlContext.createDataFrame([
(True, 1),
(True, 1),
(True, 2),
(False, 3),
(False, 1),
], ("shouldMerge", "number"))
false_df = df.filter("shouldMerge = false")
true_df = df.filter("shouldMerge = true")
result = true_df.groupBy("shouldMerge")\
.agg(functions.sum("number").alias("number"))\
.unionAll(false_df)
df = sqlContext.createDataFrame([
(1, 1),
(2, 1),
(1, 2),
(-1, 3),
(-1, 1),
], ("mergeId", "number"))
merge_condition = df["mergeId"] > -1
remaining = ~merge_condition
grouby_field = "mergeId"
false_df = df.filter(remaining)
true_df = df.filter(merge_condition)
result = true_df.groupBy(grouby_field)\
.agg(functions.sum("number").alias("number"))\
.unionAll(false_df)
result.show()
</code></pre>