How to insert a column at specified position without listing all the existing column names?
I have this dataframe:
from pyspark.sql import functions as F
df = spark.range(1).select(
F.lit(11).alias('a'),
F.lit(22).alias('b'),
F.lit(33).alias('c'),
)
df.show()
# +---+---+---+
# | a| b| c|
# +---+---+---+
# | 11| 22| 33|
# +---+---+---+
I want to insert a column right after column a.
I know I can list all the columns in the order I like:
df.select('a', F.lit('foo'), 'b', 'c')
But I want to insert a column without listing the existing column names.
Something like this could work:
position = 1
cols = df.columns
df = df.select(*cols[:position], F.lit('foo'), *cols[position:])
df.show()
# +---+---+---+---+
# | a|foo| b| c|
# +---+---+---+---+
# | 11|foo| 22| 33|
# +---+---+---+---+
A function to insert or move a column could be created:
def place_col(df, position: int, col_name: str, col_expr=None):
    """Return *df* with *col_name* placed at index *position*.

    If *col_expr* is given, it is inserted (aliased to *col_name*) at that
    index; any existing column with the same name is replaced. If *col_expr*
    is None, the existing column *col_name* is moved to *position*.
    """
    # All other columns, in their current order.
    others = [name for name in df.columns if name != col_name]
    # Either the supplied expression or a reference to the existing column.
    expr = col_expr if col_expr is not None else F.col(col_name)
    return df.select(
        *others[:position],
        expr.alias(col_name),
        *others[position:],
    )
df = place_col(df, 1, 'foo', F.lit('bar'))
df.show()
# +---+---+---+---+
# | a|foo| b| c|
# +---+---+---+---+
# | 11|bar| 22| 33|
# +---+---+---+---+
df = place_col(df, 0, 'c')
df.show()
# +---+---+---+
# | c| a| b|
# +---+---+---+
# | 33| 11| 22|
# +---+---+---+