Creating DataFrames

Based on the WafaStudies PySpark tutorial.

Imports

import findspark
findspark.init()  # locate the local Spark installation and make pyspark importable
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Local session using all available cores
spark = SparkSession.builder\
                    .appName('Spark')\
                    .master("local[*]")\
                    .getOrCreate()

Create a DataFrame

help(spark.createDataFrame)
Help on method createDataFrame in module pyspark.sql.session:

createDataFrame(data: Union[pyspark.rdd.RDD[Any], Iterable[Any], ForwardRef('PandasDataFrameLike'), ForwardRef('ArrayLike')], schema: Union[pyspark.sql.types.AtomicType, pyspark.sql.types.StructType, str, NoneType] = None, samplingRatio: Optional[float] = None, verifySchema: bool = True) -> pyspark.sql.dataframe.DataFrame method of pyspark.sql.session.SparkSession instance
    Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame`
    or a :class:`numpy.ndarray`.
    
    .. versionadded:: 2.0.0
    
    .. versionchanged:: 3.4.0
        Supports Spark Connect.
    
    Parameters
    ----------
    data : :class:`RDD` or iterable
        an RDD of any kind of SQL data representation (:class:`Row`,
        :class:`tuple`, ``int``, ``boolean``, etc.), or :class:`list`,
        :class:`pandas.DataFrame` or :class:`numpy.ndarray`.
    schema : :class:`pyspark.sql.types.DataType`, str or list, optional
        a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
        column names, default is None. The data type string format equals to
        :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
        omit the ``struct<>``.
    
        When ``schema`` is a list of column names, the type of each column
        will be inferred from ``data``.
    
        When ``schema`` is ``None``, it will try to infer the schema (column names and types)
        from ``data``, which should be an RDD of either :class:`Row`,
        :class:`namedtuple`, or :class:`dict`.
    
        When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must
        match the real data, or an exception will be thrown at runtime. If the given schema is
        not :class:`pyspark.sql.types.StructType`, it will be wrapped into a
        :class:`pyspark.sql.types.StructType` as its only field, and the field name will be
        "value". Each record will also be wrapped into a tuple, which can be converted to row
        later.
    samplingRatio : float, optional
        the sample ratio of rows used for inferring. The first few rows will be used
        if ``samplingRatio`` is ``None``.
    verifySchema : bool, optional
        verify data types of every row against schema. Enabled by default.
    
        .. versionadded:: 2.1.0
    
    Returns
    -------
    :class:`DataFrame`
    
    Notes
    -----
    Usage with `spark.sql.execution.arrow.pyspark.enabled=True` is experimental.
    
    Examples
    --------
    Create a DataFrame from a list of tuples.
    
    >>> spark.createDataFrame([('Alice', 1)]).show()
    +-----+---+
    |   _1| _2|
    +-----+---+
    |Alice|  1|
    +-----+---+
    
    Create a DataFrame from a list of dictionaries.
    
    >>> d = [{'name': 'Alice', 'age': 1}]
    >>> spark.createDataFrame(d).show()
    +---+-----+
    |age| name|
    +---+-----+
    |  1|Alice|
    +---+-----+
    
    Create a DataFrame with column names specified.
    
    >>> spark.createDataFrame([('Alice', 1)], ['name', 'age']).show()
    +-----+---+
    | name|age|
    +-----+---+
    |Alice|  1|
    +-----+---+
    
    Create a DataFrame with the explicit schema specified.
    
    >>> from pyspark.sql.types import *
    >>> schema = StructType([
    ...    StructField("name", StringType(), True),
    ...    StructField("age", IntegerType(), True)])
    >>> spark.createDataFrame([('Alice', 1)], schema).show()
    +-----+---+
    | name|age|
    +-----+---+
    |Alice|  1|
    +-----+---+
    
    Create a DataFrame with the schema in DDL formatted string.
    
    >>> spark.createDataFrame([('Alice', 1)], "name: string, age: int").show()
    +-----+---+
    | name|age|
    +-----+---+
    |Alice|  1|
    +-----+---+
    
    Create an empty DataFrame.
    When initializing an empty DataFrame in PySpark, it's mandatory to specify its schema,
    as the DataFrame lacks data from which the schema can be inferred.
    
    >>> spark.createDataFrame([], "name: string, age: int").show()
    +----+---+
    |name|age|
    +----+---+
    +----+---+
    
    Create a DataFrame from Row objects.
    
    >>> from pyspark.sql import Row
    >>> Person = Row('name', 'age')
    >>> df = spark.createDataFrame([Person("Alice", 1)])
    >>> df.show()
    +-----+---+
    | name|age|
    +-----+---+
    |Alice|  1|
    +-----+---+
    
    Create a DataFrame from a pandas DataFrame.
    
    >>> spark.createDataFrame(df.toPandas()).show()  # doctest: +SKIP
    +-----+---+
    | name|age|
    +-----+---+
    |Alice|  1|
    +-----+---+
    >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).show()  # doctest: +SKIP
    +---+---+
    |  0|  1|
    +---+---+
    |  1|  2|
    +---+---+
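With the signature covered, the cells below exercise the main schema options in turn. First, a list of tuples with no schema at all: Spark infers the column types and falls back to the positional names _1 and _2.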
data = [(1, 'Pedro'), (2, 'Guilherme')]
df = spark.createDataFrame(data=data)

df.show()
df.printSchema()
                                                                                
+---+---------+
| _1|       _2|
+---+---------+
|  1|    Pedro|
|  2|Guilherme|
+---+---------+

root
 |-- _1: long (nullable = true)
 |-- _2: string (nullable = true)
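As the schema shows, Python ints are inferred as long and the columns get positional names. If you only want proper names, DataFrame.toDF accepts replacement column names after the fact; a minimal sketch, not part of the original tutorial:

df = spark.createDataFrame(data=data)  # columns come out as _1, _2
df = df.toDF('id', 'name')             # rename them positionally
df.show()

Passing the names directly to createDataFrame, as in the next cell, achieves the same thing in one step.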
data = [(1, 'Pedro'), (2, 'Guilherme')]
df = spark.createDataFrame(data=data, schema=["id", "name"])

df.show()
df.printSchema()
+---+---------+
| id|     name|
+---+---------+
|  1|    Pedro|
|  2|Guilherme|
+---+---------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
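The list form fixes the names, but the types are still inferred, which is why id comes out as long rather than int. For full control over names, types, and nullability, build the schema with StructType and StructField, introduced next.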
from pyspark.sql.types import *
help(StructType)
Help on class StructType in module pyspark.sql.types:

class StructType(DataType)
 |  StructType(fields: Optional[List[pyspark.sql.types.StructField]] = None)
 |  
 |  Struct type, consisting of a list of :class:`StructField`.
 |  
 |  This is the data type representing a :class:`Row`.
 |  
 |  Iterating a :class:`StructType` will iterate over its :class:`StructField`\s.
 |  A contained :class:`StructField` can be accessed by its name or position.
 |  
 |  Examples
 |  --------
 |  >>> from pyspark.sql.types import *
 |  >>> struct1 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct1["f1"]
 |  StructField('f1', StringType(), True)
 |  >>> struct1[0]
 |  StructField('f1', StringType(), True)
 |  
 |  >>> struct1 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct2 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct1 == struct2
 |  True
 |  >>> struct1 = StructType([StructField("f1", CharType(10), True)])
 |  >>> struct2 = StructType([StructField("f1", CharType(10), True)])
 |  >>> struct1 == struct2
 |  True
 |  >>> struct1 = StructType([StructField("f1", VarcharType(10), True)])
 |  >>> struct2 = StructType([StructField("f1", VarcharType(10), True)])
 |  >>> struct1 == struct2
 |  True
 |  >>> struct1 = StructType([StructField("f1", StringType(), True)])
 |  >>> struct2 = StructType([StructField("f1", StringType(), True),
 |  ...     StructField("f2", IntegerType(), False)])
 |  >>> struct1 == struct2
 |  False
 |  
 |  The below example demonstrates how to create a DataFrame based on a struct created
 |  using class:`StructType` and class:`StructField`:
 |  
 |  >>> data = [("Alice", ["Java", "Scala"]), ("Bob", ["Python", "Scala"])]
 |  >>> schema = StructType([
 |  ...     StructField("name", StringType()),
 |  ...     StructField("languagesSkills", ArrayType(StringType())),
 |  ... ])
 |  >>> df = spark.createDataFrame(data=data, schema=schema)
 |  >>> df.printSchema()
 |  root
 |   |-- name: string (nullable = true)
 |   |-- languagesSkills: array (nullable = true)
 |   |    |-- element: string (containsNull = true)
 |  >>> df.show()
 |  +-----+---------------+
 |  | name|languagesSkills|
 |  +-----+---------------+
 |  |Alice|  [Java, Scala]|
 |  |  Bob|[Python, Scala]|
 |  +-----+---------------+
 |  
 |  Method resolution order:
 |      StructType
 |      DataType
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getitem__(self, key: Union[str, int]) -> pyspark.sql.types.StructField
 |      Access fields by name or slice.
 |  
 |  __init__(self, fields: Optional[List[pyspark.sql.types.StructField]] = None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __iter__(self) -> Iterator[pyspark.sql.types.StructField]
 |      Iterate the fields
 |  
 |  __len__(self) -> int
 |      Return the number of fields.
 |  
 |  __repr__(self) -> str
 |      Return repr(self).
 |  
 |  add(self, field: Union[str, pyspark.sql.types.StructField], data_type: Union[str, pyspark.sql.types.DataType, NoneType] = None, nullable: bool = True, metadata: Optional[Dict[str, Any]] = None) -> 'StructType'
 |      Construct a :class:`StructType` by adding new elements to it, to define the schema.
 |      The method accepts either:
 |      
 |          a) A single parameter which is a :class:`StructField` object.
 |          b) Between 2 and 4 parameters as (name, data_type, nullable (optional),
 |             metadata(optional). The data_type parameter may be either a String or a
 |             :class:`DataType` object.
 |      
 |      Parameters
 |      ----------
 |      field : str or :class:`StructField`
 |          Either the name of the field or a :class:`StructField` object
 |      data_type : :class:`DataType`, optional
 |          If present, the DataType of the :class:`StructField` to create
 |      nullable : bool, optional
 |          Whether the field to add should be nullable (default True)
 |      metadata : dict, optional
 |          Any additional metadata (default None)
 |      
 |      Returns
 |      -------
 |      :class:`StructType`
 |      
 |      Examples
 |      --------
 |      >>> from pyspark.sql.types import IntegerType, StringType, StructField, StructType
 |      >>> struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
 |      >>> struct2 = StructType([StructField("f1", StringType(), True),
 |      ...     StructField("f2", StringType(), True, None)])
 |      >>> struct1 == struct2
 |      True
 |      >>> struct1 = StructType().add(StructField("f1", StringType(), True))
 |      >>> struct2 = StructType([StructField("f1", StringType(), True)])
 |      >>> struct1 == struct2
 |      True
 |      >>> struct1 = StructType().add("f1", "string", True)
 |      >>> struct2 = StructType([StructField("f1", StringType(), True)])
 |      >>> struct1 == struct2
 |      True
 |  
 |  fieldNames(self) -> List[str]
 |      Returns all field names in a list.
 |      
 |      Examples
 |      --------
 |      >>> from pyspark.sql.types import StringType, StructField, StructType
 |      >>> struct = StructType([StructField("f1", StringType(), True)])
 |      >>> struct.fieldNames()
 |      ['f1']
 |  
 |  fromInternal(self, obj: Tuple) -> 'Row'
 |      Converts an internal SQL object into a native Python object.
 |  
 |  jsonValue(self) -> Dict[str, Any]
 |  
 |  needConversion(self) -> bool
 |      Does this type needs conversion between Python object and internal SQL object.
 |      
 |      This is used to avoid the unnecessary conversion for ArrayType/MapType/StructType.
 |  
 |  simpleString(self) -> str
 |  
 |  toInternal(self, obj: Tuple) -> Tuple
 |      Converts a Python object into an internal SQL object.
 |  
 |  ----------------------------------------------------------------------
 |  Class methods defined here:
 |  
 |  fromJson(json: Dict[str, Any]) -> 'StructType' from builtins.type
 |      Constructs :class:`StructType` from a schema defined in JSON format.
 |      
 |      Below is a JSON schema it must adhere to::
 |      
 |          {
 |            "title":"StructType",
 |            "description":"Schema of StructType in json format",
 |            "type":"object",
 |            "properties":{
 |               "fields":{
 |                  "description":"Array of struct fields",
 |                  "type":"array",
 |                  "items":{
 |                      "type":"object",
 |                      "properties":{
 |                         "name":{
 |                            "description":"Name of the field",
 |                            "type":"string"
 |                         },
 |                         "type":{
 |                            "description": "Type of the field. Can either be
 |                                            another nested StructType or primitive type",
 |                            "type":"object/string"
 |                         },
 |                         "nullable":{
 |                            "description":"If nulls are allowed",
 |                            "type":"boolean"
 |                         },
 |                         "metadata":{
 |                            "description":"Additional metadata to supply",
 |                            "type":"object"
 |                         },
 |                         "required":[
 |                            "name",
 |                            "type",
 |                            "nullable",
 |                            "metadata"
 |                         ]
 |                      }
 |                 }
 |              }
 |           }
 |         }
 |      
 |      Parameters
 |      ----------
 |      json : dict or a dict-like object e.g. JSON object
 |          This "dict" must have "fields" key that returns an array of fields
 |          each of which must have specific keys (name, type, nullable, metadata).
 |      
 |      Returns
 |      -------
 |      :class:`StructType`
 |      
 |      Examples
 |      --------
 |      >>> json_str = '''
 |      ...  {
 |      ...      "fields": [
 |      ...          {
 |      ...              "metadata": {},
 |      ...              "name": "Person",
 |      ...              "nullable": true,
 |      ...              "type": {
 |      ...                  "fields": [
 |      ...                      {
 |      ...                          "metadata": {},
 |      ...                          "name": "name",
 |      ...                          "nullable": false,
 |      ...                          "type": "string"
 |      ...                      },
 |      ...                      {
 |      ...                          "metadata": {},
 |      ...                          "name": "surname",
 |      ...                          "nullable": false,
 |      ...                          "type": "string"
 |      ...                      }
 |      ...                  ],
 |      ...                  "type": "struct"
 |      ...              }
 |      ...          }
 |      ...      ],
 |      ...      "type": "struct"
 |      ...  }
 |      ...  '''
 |      >>> import json
 |      >>> scheme = StructType.fromJson(json.loads(json_str))
 |      >>> scheme.simpleString()
 |      'struct<Person:struct<name:string,surname:string>>'
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from DataType:
 |  
 |  __eq__(self, other: Any) -> bool
 |      Return self==value.
 |  
 |  __hash__(self) -> int
 |      Return hash(self).
 |  
 |  __ne__(self, other: Any) -> bool
 |      Return self!=value.
 |  
 |  json(self) -> str
 |  
 |  ----------------------------------------------------------------------
 |  Class methods inherited from DataType:
 |  
 |  typeName() -> str from builtins.type
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from DataType:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
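With the StructType API documented, the next cell declares an explicit schema so that id is stored as an integer instead of the inferred long.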
data = [(1, 'Pedro'), (2, 'Guilherme')]
schema = StructType([StructField(name='id', dataType=IntegerType()),
                     StructField(name='name', dataType=StringType())])

df = spark.createDataFrame(data=data, schema=schema)

df.show()
df.printSchema()
+---+---------+
| id|     name|
+---+---------+
|  1|    Pedro|
|  2|Guilherme|
+---+---------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
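The same schema can also be built incrementally with the add method documented above; a quick sketch under the same imports:

schema = StructType().add('id', IntegerType()).add('name', StringType())
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()  # id: integer, name: string, as before

add also accepts a datatype string, e.g. .add('id', 'int').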
data = [{'id':1, 'name':'Pedro'},
        {'id':2, 'name':'Guilherme'}]

df = spark.createDataFrame(data=data)

df.show()
df.printSchema()
+---+---------+
| id|     name|
+---+---------+
|  1|    Pedro|
|  2|Guilherme|
+---+---------+

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
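With a list of dictionaries, the keys supply the column names, while the types are again inferred (hence id: long). Finally, the docstring above also allows the schema to be a DDL-formatted string; a minimal sketch applying it to the same data:

data = [(1, 'Pedro'), (2, 'Guilherme')]
df = spark.createDataFrame(data=data, schema='id int, name string')
df.show()
df.printSchema()  # id: integer, name: string

A DDL string is often the most compact way to pin down both names and types.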