PySpark：具有不同列的数据帧的动态联合

# Question snippet: union three DataFrames that have different column sets by
# hand-adding every missing column (filled with 0) and selecting a common order.
import pyspark
from pyspark import SparkContext
from pyspark.sql.functions import lit
from pyspark.sql import SparkSession

sqlContext = pyspark.SQLContext(pyspark.SparkContext())

# Three sample frames; each one lacks at least one of the columns C1..C4.
df01 = sqlContext.createDataFrame([(1, 2, 3), (9, 5, 6)], ("C1", "C2", "C3"))
df02 = sqlContext.createDataFrame([(11,12, 13), (10, 15, 16)], ("C2", "C3", "C4"))
df03 = sqlContext.createDataFrame([(111,112), (110, 115)], ("C1", "C4"))

# Pad each frame with its missing column(s), then put the columns in one order.
# NOTE(review): the select() names are lower-case while the frames use
# upper-case names; this presumably relies on Spark's default case-insensitive
# column resolution -- confirm before changing either side.
df01_add = df01.withColumn("C4", lit(0)).select("c1","c2","c3","c4")
df02_add = df02.withColumn("C1", lit(0)).select("c1","c2","c3","c4")
df03_add = (
    df03.withColumn("C2", lit(0))
    .withColumn("C3", lit(0))
    .select("c1","c2","c3","c4")
)

# union() matches columns by position, hence the identical select() above.
df_uni = df01_add.union(df02_add).union(df03_add)
df_uni.show()

class Student:
    """Hold two marks and provide a ``sum`` helper with optional arguments."""

    def __init__(self, m1, m2):
        # The original spelled this ``___Init__``, so it was never used as the
        # constructor and ``Student(m1, m2)`` raised TypeError.
        self.m1 = m1
        self.m2 = m2

    def sum(self, c1=None, c2=None, c3=None, c4=None):
        """Add up the leading arguments that were supplied.

        Returns ``c1 + c2 + c3`` when all three are given, ``c1 + c2`` when
        only the first two are given, and ``c1`` otherwise (which is ``None``
        when nothing was passed). ``c4`` is accepted but not used.
        """
        if c1 is not None and c2 is not None and c3 is not None:
            return c1 + c2 + c3
        if c1 is not None and c2 is not None:
            return c1 + c2
        return c1


# The original snippet used ``s1`` without ever creating it; the constructor
# arguments below are illustrative placeholders.
s1 = Student(58, 69)
print(s1.sum(55, 65, 23))

1条回答

网友

1楼 · 发布于 2024-05-16 13:08:04

也许有很多更好的方法可以做到这一点，但下面的方法或许对将来遇到同样问题的人有用。

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

# Dynamically union DataFrames whose column sets differ: compute the sorted
# union of all column names, pad every frame with the columns it lacks
# (filled with 0), then union the aligned frames.
spark = (
    SparkSession.builder
    .appName("DynamicFrame")
    .getOrCreate()
)

df01 = spark.createDataFrame([(1, 2, 3), (9, 5, 6)], ("C1", "C2", "C3"))
df02 = spark.createDataFrame([(11,12, 13), (10, 15, 16)], ("C2", "C3", "C4"))
df03 = spark.createDataFrame([(111,112), (110, 115)], ("C1", "C4"))

dataframes = [df01, df02, df03]

# Sorted list of every column name that appears in any of the dataframes
cols = sorted(set().union(*(df.columns for df in dataframes)))

# Dictionary of aligned dataframes, keyed 'df0', 'df1', ... in input order
dfs = {}
for i, d in enumerate(dataframes):
    # Add the columns this dataframe is missing (with value 0)
    for x in [c for c in cols if c not in d.columns]:
        d = d.withColumn(x, lit(0))
    # Use 'select' so every frame has the same column order; union() below
    # is chained on frames that must line up column by column.
    dfs['df' + str(i)] = d.select(cols)

# Now put it all together with a loop (union)
result = dfs['df0']  # Take the first dataframe, add the others to it
# Build a real list of the remaining keys: in Python 3 dict.keys() is a view
# without .remove(), so the original keys()/remove() pair raised AttributeError.
dfs_to_add = [name for name in dfs if name != 'df0']
for x in dfs_to_add:
    result = result.union(dfs[x])
result.show()

输出：

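+---+---+---+---+
| C1| C2| C3| C4|
+---+---+---+---+
|  1|  2|  3|  0|
|  9|  5|  6|  0|
|  0| 11| 12| 13|
|  0| 10| 15| 16|
|111|  0|  0|112|
|110|  0|  0|115|
+---+---+---+---+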

相关问题更多 >

编程相关推荐

热门问题

热门文章