import findspark
findspark.init()
Creating DataFrames
Based on the WafaStudies PySpark tutorial.
Imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder\
        .appName('Spark')\
        .master("local[*]")\
        .getOrCreate()
your 131072x1 screen size is bogus. expect trouble
23/10/25 15:10:15 WARN Utils: Your hostname, PC resolves to a loopback address: 127.0.1.1; using 172.29.148.244 instead (on interface eth0)
23/10/25 15:10:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/25 15:10:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Create DataFrame
help(spark.createDataFrame)
Help on method createDataFrame in module pyspark.sql.session:
createDataFrame(data: Union[pyspark.rdd.RDD[Any], Iterable[Any], ForwardRef('PandasDataFrameLike'), ForwardRef('ArrayLike')], schema: Union[pyspark.sql.types.AtomicType, pyspark.sql.types.StructType, str, NoneType] = None, samplingRatio: Optional[float] = None, verifySchema: bool = True) -> pyspark.sql.dataframe.DataFrame method of pyspark.sql.session.SparkSession instance
Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame`
or a :class:`numpy.ndarray`.
.. versionadded:: 2.0.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
data : :class:`RDD` or iterable
an RDD of any kind of SQL data representation (:class:`Row`,
:class:`tuple`, ``int``, ``boolean``, etc.), or :class:`list`,
:class:`pandas.DataFrame` or :class:`numpy.ndarray`.
schema : :class:`pyspark.sql.types.DataType`, str or list, optional
a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
column names, default is None. The data type string format equals to
:class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
omit the ``struct<>``.
When ``schema`` is a list of column names, the type of each column
will be inferred from ``data``.
When ``schema`` is ``None``, it will try to infer the schema (column names and types)
from ``data``, which should be an RDD of either :class:`Row`,
:class:`namedtuple`, or :class:`dict`.
When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must
match the real data, or an exception will be thrown at runtime. If the given schema is
not :class:`pyspark.sql.types.StructType`, it will be wrapped into a
:class:`pyspark.sql.types.StructType` as its only field, and the field name will be
"value". Each record will also be wrapped into a tuple, which can be converted to row
later.
samplingRatio : float, optional
the sample ratio of rows used for inferring. The first few rows will be used
if ``samplingRatio`` is ``None``.
verifySchema : bool, optional
verify data types of every row against schema. Enabled by default.
.. versionadded:: 2.1.0
Returns
-------
:class:`DataFrame`
Notes
-----
Usage with `spark.sql.execution.arrow.pyspark.enabled=True` is experimental.
Examples
--------
Create a DataFrame from a list of tuples.
>>> spark.createDataFrame([('Alice', 1)]).show()
+-----+---+
| _1| _2|
+-----+---+
|Alice| 1|
+-----+---+
Create a DataFrame from a list of dictionaries.
>>> d = [{'name': 'Alice', 'age': 1}]
>>> spark.createDataFrame(d).show()
+---+-----+
|age| name|
+---+-----+
| 1|Alice|
+---+-----+
Create a DataFrame with column names specified.
>>> spark.createDataFrame([('Alice', 1)], ['name', 'age']).show()
+-----+---+
| name|age|
+-----+---+
|Alice| 1|
+-----+---+
Create a DataFrame with the explicit schema specified.
>>> from pyspark.sql.types import *
>>> schema = StructType([
... StructField("name", StringType(), True),
... StructField("age", IntegerType(), True)])
>>> spark.createDataFrame([('Alice', 1)], schema).show()
+-----+---+
| name|age|
+-----+---+
|Alice| 1|
+-----+---+
Create a DataFrame with the schema in DDL formatted string.
>>> spark.createDataFrame([('Alice', 1)], "name: string, age: int").show()
+-----+---+
| name|age|
+-----+---+
|Alice| 1|
+-----+---+
Create an empty DataFrame.
When initializing an empty DataFrame in PySpark, it's mandatory to specify its schema,
as the DataFrame lacks data from which the schema can be inferred.
>>> spark.createDataFrame([], "name: string, age: int").show()
+----+---+
|name|age|
+----+---+
+----+---+
Create a DataFrame from Row objects.
>>> from pyspark.sql import Row
>>> Person = Row('name', 'age')
>>> df = spark.createDataFrame([Person("Alice", 1)])
>>> df.show()
+-----+---+
| name|age|
+-----+---+
|Alice| 1|
+-----+---+
Create a DataFrame from a pandas DataFrame.
>>> spark.createDataFrame(df.toPandas()).show() # doctest: +SKIP
+-----+---+
| name|age|
+-----+---+
|Alice| 1|
+-----+---+
>>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP
+---+---+
| 0| 1|
+---+---+
| 1| 2|
+---+---+
data = [(1,'Pedro'), (2, 'Guilherme')]
df = spark.createDataFrame(data=data)
df.show()
df.printSchema()
+---+---------+
| _1| _2|
+---+---------+
| 1| Pedro|
| 2|Guilherme|
+---+---------+
root
|-- _1: long (nullable = true)
|-- _2: string (nullable = true)
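Without a schema, Spark falls back to the positional column names _1 and _2. One way to rename them afterwards is DataFrame.toDF; a minimal sketch (the names "id" and "name" are just illustrative, this cell is not part of the original notebook):
df = df.toDF("id", "name")
df.printSchema()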
data = [(1,'Pedro'), (2, 'Guilherme')]
df = spark.createDataFrame(data=data, schema=["id", "name"])
df.show()
df.printSchema()
+---+---------+
| id| name|
+---+---------+
| 1| Pedro|
| 2|Guilherme|
+---+---------+
root
|-- id: long (nullable = true)
|-- name: string (nullable = true)
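As the createDataFrame help above describes, the schema can also be a DDL-formatted string, which pins the types as well as the names. A minimal sketch reusing the same data list (assumed, not part of the original notebook):
df = spark.createDataFrame(data=data, schema="id int, name string")
df.printSchema()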
from pyspark.sql.types import *
help(StructType)
Help on class StructType in module pyspark.sql.types:
class StructType(DataType)
| StructType(fields: Optional[List[pyspark.sql.types.StructField]] = None)
|
| Struct type, consisting of a list of :class:`StructField`.
|
| This is the data type representing a :class:`Row`.
|
| Iterating a :class:`StructType` will iterate over its :class:`StructField`\s.
| A contained :class:`StructField` can be accessed by its name or position.
|
| Examples
| --------
| >>> from pyspark.sql.types import *
| >>> struct1 = StructType([StructField("f1", StringType(), True)])
| >>> struct1["f1"]
| StructField('f1', StringType(), True)
| >>> struct1[0]
| StructField('f1', StringType(), True)
|
| >>> struct1 = StructType([StructField("f1", StringType(), True)])
| >>> struct2 = StructType([StructField("f1", StringType(), True)])
| >>> struct1 == struct2
| True
| >>> struct1 = StructType([StructField("f1", CharType(10), True)])
| >>> struct2 = StructType([StructField("f1", CharType(10), True)])
| >>> struct1 == struct2
| True
| >>> struct1 = StructType([StructField("f1", VarcharType(10), True)])
| >>> struct2 = StructType([StructField("f1", VarcharType(10), True)])
| >>> struct1 == struct2
| True
| >>> struct1 = StructType([StructField("f1", StringType(), True)])
| >>> struct2 = StructType([StructField("f1", StringType(), True),
| ... StructField("f2", IntegerType(), False)])
| >>> struct1 == struct2
| False
|
| The below example demonstrates how to create a DataFrame based on a struct created
| using class:`StructType` and class:`StructField`:
|
| >>> data = [("Alice", ["Java", "Scala"]), ("Bob", ["Python", "Scala"])]
| >>> schema = StructType([
| ... StructField("name", StringType()),
| ... StructField("languagesSkills", ArrayType(StringType())),
| ... ])
| >>> df = spark.createDataFrame(data=data, schema=schema)
| >>> df.printSchema()
| root
| |-- name: string (nullable = true)
| |-- languagesSkills: array (nullable = true)
| | |-- element: string (containsNull = true)
| >>> df.show()
| +-----+---------------+
| | name|languagesSkills|
| +-----+---------------+
| |Alice| [Java, Scala]|
| | Bob|[Python, Scala]|
| +-----+---------------+
|
| Method resolution order:
| StructType
| DataType
| builtins.object
|
| Methods defined here:
|
| __getitem__(self, key: Union[str, int]) -> pyspark.sql.types.StructField
| Access fields by name or slice.
|
| __init__(self, fields: Optional[List[pyspark.sql.types.StructField]] = None)
| Initialize self. See help(type(self)) for accurate signature.
|
| __iter__(self) -> Iterator[pyspark.sql.types.StructField]
| Iterate the fields
|
| __len__(self) -> int
| Return the number of fields.
|
| __repr__(self) -> str
| Return repr(self).
|
| add(self, field: Union[str, pyspark.sql.types.StructField], data_type: Union[str, pyspark.sql.types.DataType, NoneType] = None, nullable: bool = True, metadata: Optional[Dict[str, Any]] = None) -> 'StructType'
| Construct a :class:`StructType` by adding new elements to it, to define the schema.
| The method accepts either:
|
| a) A single parameter which is a :class:`StructField` object.
| b) Between 2 and 4 parameters as (name, data_type, nullable (optional),
| metadata(optional). The data_type parameter may be either a String or a
| :class:`DataType` object.
|
| Parameters
| ----------
| field : str or :class:`StructField`
| Either the name of the field or a :class:`StructField` object
| data_type : :class:`DataType`, optional
| If present, the DataType of the :class:`StructField` to create
| nullable : bool, optional
| Whether the field to add should be nullable (default True)
| metadata : dict, optional
| Any additional metadata (default None)
|
| Returns
| -------
| :class:`StructType`
|
| Examples
| --------
| >>> from pyspark.sql.types import IntegerType, StringType, StructField, StructType
| >>> struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None)
| >>> struct2 = StructType([StructField("f1", StringType(), True),
| ... StructField("f2", StringType(), True, None)])
| >>> struct1 == struct2
| True
| >>> struct1 = StructType().add(StructField("f1", StringType(), True))
| >>> struct2 = StructType([StructField("f1", StringType(), True)])
| >>> struct1 == struct2
| True
| >>> struct1 = StructType().add("f1", "string", True)
| >>> struct2 = StructType([StructField("f1", StringType(), True)])
| >>> struct1 == struct2
| True
|
| fieldNames(self) -> List[str]
| Returns all field names in a list.
|
| Examples
| --------
| >>> from pyspark.sql.types import StringType, StructField, StructType
| >>> struct = StructType([StructField("f1", StringType(), True)])
| >>> struct.fieldNames()
| ['f1']
|
| fromInternal(self, obj: Tuple) -> 'Row'
| Converts an internal SQL object into a native Python object.
|
| jsonValue(self) -> Dict[str, Any]
|
| needConversion(self) -> bool
| Does this type needs conversion between Python object and internal SQL object.
|
| This is used to avoid the unnecessary conversion for ArrayType/MapType/StructType.
|
| simpleString(self) -> str
|
| toInternal(self, obj: Tuple) -> Tuple
| Converts a Python object into an internal SQL object.
|
| ----------------------------------------------------------------------
| Class methods defined here:
|
| fromJson(json: Dict[str, Any]) -> 'StructType' from builtins.type
| Constructs :class:`StructType` from a schema defined in JSON format.
|
| Below is a JSON schema it must adhere to::
|
| {
| "title":"StructType",
| "description":"Schema of StructType in json format",
| "type":"object",
| "properties":{
| "fields":{
| "description":"Array of struct fields",
| "type":"array",
| "items":{
| "type":"object",
| "properties":{
| "name":{
| "description":"Name of the field",
| "type":"string"
| },
| "type":{
| "description": "Type of the field. Can either be
| another nested StructType or primitive type",
| "type":"object/string"
| },
| "nullable":{
| "description":"If nulls are allowed",
| "type":"boolean"
| },
| "metadata":{
| "description":"Additional metadata to supply",
| "type":"object"
| },
| "required":[
| "name",
| "type",
| "nullable",
| "metadata"
| ]
| }
| }
| }
| }
| }
|
| Parameters
| ----------
| json : dict or a dict-like object e.g. JSON object
| This "dict" must have "fields" key that returns an array of fields
| each of which must have specific keys (name, type, nullable, metadata).
|
| Returns
| -------
| :class:`StructType`
|
| Examples
| --------
| >>> json_str = '''
| ... {
| ... "fields": [
| ... {
| ... "metadata": {},
| ... "name": "Person",
| ... "nullable": true,
| ... "type": {
| ... "fields": [
| ... {
| ... "metadata": {},
| ... "name": "name",
| ... "nullable": false,
| ... "type": "string"
| ... },
| ... {
| ... "metadata": {},
| ... "name": "surname",
| ... "nullable": false,
| ... "type": "string"
| ... }
| ... ],
| ... "type": "struct"
| ... }
| ... }
| ... ],
| ... "type": "struct"
| ... }
| ... '''
| >>> import json
| >>> scheme = StructType.fromJson(json.loads(json_str))
| >>> scheme.simpleString()
| 'struct<Person:struct<name:string,surname:string>>'
|
| ----------------------------------------------------------------------
| Methods inherited from DataType:
|
| __eq__(self, other: Any) -> bool
| Return self==value.
|
| __hash__(self) -> int
| Return hash(self).
|
| __ne__(self, other: Any) -> bool
| Return self!=value.
|
| json(self) -> str
|
| ----------------------------------------------------------------------
| Class methods inherited from DataType:
|
| typeName() -> str from builtins.type
|
| ----------------------------------------------------------------------
| Data descriptors inherited from DataType:
|
| __dict__
| dictionary for instance variables (if defined)
|
| __weakref__
| list of weak references to the object (if defined)
data = [(1,'Pedro'), (2, 'Guilherme')]
schema = StructType([StructField(name='id', dataType=IntegerType()),
                     StructField(name='name', dataType=StringType())])
df = spark.createDataFrame(data=data, schema=schema)
df.show()
df.printSchema()
+---+---------+
| id| name|
+---+---------+
| 1| Pedro|
| 2|Guilherme|
+---+---------+
root
|-- id: integer (nullable = true)
|-- name: string (nullable = true)
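The same schema can also be built incrementally with StructType().add(), which the StructType help above documents. A minimal sketch equivalent to the explicit StructField list (added here for illustration, not in the original notebook):
schema = StructType().add("id", IntegerType(), True).add("name", StringType(), True)
df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()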
data = [{'id':1, 'name':'Pedro'},
        {'id':2, 'name':'Guilherme'}]
df = spark.createDataFrame(data=data)
df.show()
df.printSchema()
+---+---------+
| id| name|
+---+---------+
| 1| Pedro|
| 2|Guilherme|
+---+---------+
root
|-- id: long (nullable = true)
|-- name: string (nullable = true)
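createDataFrame also accepts Row objects, as shown in its help above; the column names then come from the Row itself. A minimal sketch with the same two people (illustrative, not part of the original notebook):
from pyspark.sql import Row
Person = Row('id', 'name')
df = spark.createDataFrame([Person(1, 'Pedro'), Person(2, 'Guilherme')])
df.show()
df.printSchema()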