It doesn't explain how to create the id column.
# Attempts that do not work:
indexes = [2, 3, 6, 7]
df[indexes]                                   # df[...] selects columns in Spark, not rows

indexes = np.arange(df.count())               # assumes numpy imported as np
df_indexed = df.withColumn('index', indexes)  # fails: withColumn expects a Column, not a NumPy array

indexes = [2, 3, 6, 7]
df1.where("index in indexes").collect()       # fails: the SQL string cannot see the Python list
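For what it's worth, the last attempt fails because the filter string is parsed by Spark's SQL parser, which knows nothing about the Python variable indexes. A minimal sketch of two ways to actually pass the list in, assuming df1 already has an index column (the answers below show how to create one):

from pyspark.sql.functions import col

indexes = [2, 3, 6, 7]

# build the predicate as a Column expression
df1.where(col("index").isin(indexes)).collect()

# or interpolate the values into the SQL string yourself
df1.where("index in ({0})".format(",".join(map(str, indexes)))).collect()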
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, LongType

row = Row("char")
row_with_index = Row("char", "index")

df = sc.parallelize(row(chr(x)) for x in range(97, 112)).toDF()
df.show(5)

## +----+
## |char|
## +----+
## |   a|
## |   b|
## |   c|
## |   d|
## |   e|
## +----+
## only showing top 5 rows

# This part is not tested but should work and save some work later
schema  = StructType(
    df.schema.fields[:] + [StructField("index", LongType(), False)])

indexed = (df.rdd # Extract rdd
    .zipWithIndex() # Add index
    .map(lambda ri: row_with_index(*list(ri[0]) + [ri[1]])) # Map to rows
    .toDF(schema)) # It will work without schema but will be more expensive

# Select the rows of interest (use inSet instead of isin in Spark < 1.3)
from pyspark.sql.functions import col

indexes = [2, 3, 6, 7]
indexed.where(col("index").isin(indexes))
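A quick usage check; the output below is reconstructed by hand from the data above, so treat it as illustrative:

indexed.where(col("index").isin(indexes)).show()

## +----+-----+
## |char|index|
## +----+-----+
## |   c|    2|
## |   d|    3|
## |   g|    6|
## |   h|    7|
## +----+-----+

# a single row by position
indexed.where(col("index") == 3).first()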
# The camelCase monotonicallyIncreasingId was removed in later Spark releases;
# the current name is monotonically_increasing_id
from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id().alias("rowId"), "*")
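One caveat: monotonically_increasing_id guarantees unique, increasing values but not consecutive ones (the partition id is encoded in the upper 31 bits, the record number within the partition in the lower 33). If you need consecutive 0-based positions, one common workaround, sketched here, is a row_number window; note that without a partitionBy it funnels all rows through a single partition:

from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window

w = Window.orderBy(monotonically_increasing_id())
df.withColumn("rowId", row_number().over(w) - 1)  # consecutive, starting at 0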
// Scala: zip the collected rows with a 1-based index array.
// Note that collect() pulls the whole DataFrame to the driver,
// so this only scales to small data.
val index_array = (1 to df.count.toInt).toArray
// index_array: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

val final_df = sc.parallelize(
    (df.collect.map(x => (x(0), x(1))) zip index_array).map(
      x => (x._1._1.toString, x._1._2.toString, x._2)))
  .toDF("col1", "col2", "index")  // one name per output column, not a single "column_name"
# Fetch a single row whose id column equals n
df.where(df.id == n).show()
df = spark.createDataFrame(
    [(1, 143.5, 5.6, 28, 'M', 100000),
     (2, 167.2, 5.4, 45, 'M', None),
     (3, None, 5.2, None, None, None)],
    ['id', 'weight', 'height', 'age', 'gender', 'income'])
df.where('id == 3').show()   # SQL expression string
df.where(df.id == 3).show()  # Column expression
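Both filters should return the same single row, roughly:

## +---+------+------+----+------+------+
## | id|weight|height| age|gender|income|
## +---+------+------+----+------+------+
## |  3|  null|   5.2|null|  null|  null|
## +---+------+------+----+------+------+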
ids = ["2", "3"]  # renamed from id to avoid shadowing the Python built-in
df.where(df.id.isin(ids)).show()
from pyspark.sql.functions import monotonically_increasing_id
df.withColumn("Atr4", monotonically_increasing_id())  # unique and increasing, but not consecutive