Dimensions

toolbox_pyspark.dimensions 🔗

Summary

The dimensions module is used for checking the dimensions of pyspark dataframe's.

get_dims 🔗

get_dims(
    dataframe: psDataFrame,
    use_names: bool = True,
    use_comma: bool = True,
) -> Union[
    dict[str, str],
    dict[str, int],
    tuple[str, str],
    tuple[int, int],
]

Summary

Extract the dimensions of a given dataframe.

Parameters:

Name	Type	Description	Default
`dataframe`	`DataFrame`	The table to check.	required
`use_names`	`bool`	Whether or not to add `names` to the returned object. If `True`, then will return a `dict` with two keys only, for the number of `rows` and `cols`. Defaults to `True`.	`True`
`use_comma`	`bool`	Whether or not to add a comma `,` to the returned object. Defaults to `True`.	`True`

Raises:

Type	Description
`TypeError`	If any of the inputs parsed to the parameters of this function are not the correct type. Uses the `@typeguard.typechecked` decorator.

Returns:

Type	Description
`Union[Dict[str, Union[str, int]], tuple[str, ...], tTuple[int, ...]]`	The dimensions of the given `dataframe`.

Examples

Set up
>>> # Imports
>>> import pandas as pd
>>> from pyspark.sql import SparkSession
>>> from toolbox_pyspark.dimensions import get_dims
>>>
>>> # Instantiate Spark
>>> spark = SparkSession.builder.getOrCreate()
>>>
>>> # Create data
>>> df = spark.createDataFrame(
...     pd.DataFrame({
...         'a': range(5000),
...         'b': range(5000),
...     })
... )
>>>
>>> # Check
>>> print(df.count())
>>> print(len(df.columns))

Terminal

Names and commas
>>> print(get_dims(dataframe=df, use_names=True, use_commas=True))

Terminal

{"rows": "5,000", "cols": "2"}

Names but no commas
>>> print(get_dims(dataframe=df, use_names=True, use_commas=False))

Terminal

{"rows": 5000, "cols": 2}

Commas but no names
>>> print(get_dims(dataframe=df, use_names=False, use_commas=True))

Terminal

("5,000", "2")

Neither names nor commas
>>> print(get_dims(dataframe=df, use_names=False, use_commas=False))

Terminal

(5000, 2)

Source code in src/toolbox_pyspark/dimensions.py

@typechecked
def get_dims(
    dataframe: psDataFrame,
    use_names: bool = True,
    use_comma: bool = True,
) -> Union[dict[str, str], dict[str, int], tuple[str, str], tuple[int, int]]:
    """
    !!! note "Summary"
        Extract the dimensions of a given `dataframe`.

    Params:
        dataframe (psDataFrame):
            The table to check.
        use_names (bool, optional):
            Whether or not to add `names` to the returned object.<br>
            If `#!py True`, then will return a `#!py dict` with two keys only, for the number of `rows` and `cols`.<br>
            Defaults to `#!py True`.
        use_comma (bool, optional):
            Whether or not to add a comma `,` to the returned object.<br>
            Defaults to `#!py True`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (Union[Dict[str, Union[str, int]], tuple[str, ...], tTuple[int, ...]]):
            The dimensions of the given `dataframe`.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.dimensions import get_dims
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame({
        ...         'a': range(5000),
        ...         'b': range(5000),
        ...     })
        ... )
        >>>
        >>> # Check
        >>> print(df.count())
        >>> print(len(df.columns))
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        5000
        ```
        ```{.txt .text title="Terminal"}
        2
        ```
        </div>

        ```{.py .python linenums="1" title="Names and commas"}
        >>> print(get_dims(dataframe=df, use_names=True, use_commas=True))
        ```
        <div class="result" markdown>
        ```{.sh .shell  title="Terminal"}
        {"rows": "5,000", "cols": "2"}
        ```
        </div>

        ```{.py .python linenums="1" title="Names but no commas"}
        >>> print(get_dims(dataframe=df, use_names=True, use_commas=False))
        ```
        <div class="result" markdown>
        ```{.sh .shell  title="Terminal"}
        {"rows": 5000, "cols": 2}
        ```
        </div>

        ```{.py .python linenums="1" title="Commas but no names"}
        >>> print(get_dims(dataframe=df, use_names=False, use_commas=True))
        ```
        <div class="result" markdown>
        ```{.sh .shell  title="Terminal"}
        ("5,000", "2")
        ```
        </div>

        ```{.py .python linenums="1" title="Neither names nor commas"}
        >>> print(get_dims(dataframe=df, use_names=False, use_commas=False))
        ```
        <div class="result" markdown>
        ```{.sh .shell  title="Terminal"}
        (5000, 2)
        ```
        </div>
    """
    dims: tuple[int, int] = (dataframe.count(), len(dataframe.columns))
    if use_names and use_comma:
        return {"rows": f"{dims[0]:,}", "cols": f"{dims[1]:,}"}
    elif use_names and not use_comma:
        return {"rows": dims[0], "cols": dims[1]}
    elif not use_names and use_comma:
        return (f"{dims[0]:,}", f"{dims[1]:,}")
    else:
        return dims

get_dims_of_tables 🔗

get_dims_of_tables(
    tables: str_list,
    scope: Optional[dict] = None,
    use_comma: bool = True,
) -> pdDataFrame

Summary

Take in a list of the names of some tables, and for each of them, check their dimensions.

Details

This function will check against the global() scope. So you need to be careful if you're dealing with massive amounts of data in memory.

Parameters:

Name	Type	Description	Default
`tables`	`str_list`	The list of the tables that will be checked.	required
`scope`	`dict`	This is the scope against which the tables will be checked. If `None`, then it will use the `global()` scope by default.. Defaults to `None`.	`None`
`use_comma`	`bool`	Whether or not the dimensions from the tables should be formatted as a string with a comma as the thousandths delimiter. Defaults to `True`.	`True`

Raises:

Type	Description
`TypeError`	If any of the inputs parsed to the parameters of this function are not the correct type. Uses the `@typeguard.typechecked` decorator.

Returns:

Type	Description
`DataFrame`	A `pandas` `dataframe` with four columns: `["table", "type", "rows", "cols"]`.

Examples

Set up
>>> # Imports
>>> import pandas as pd
>>> from pyspark.sql import SparkSession
>>> from toolbox_pyspark.dimensions import get_dims_of_tables, get_dims
>>>
>>> # Instantiate Spark
>>> spark = SparkSession.builder.getOrCreate()
>>>
>>> # Create data
>>> df1 = spark.createDataFrame(
...     pd.DataFrame({
...         'a': range(5000),
...         'b': range(5000),
...     })
... )
>>> df2 = spark.createDataFrame(
...     pd.DataFrame({
...         'a': range(10000),
...         'b': range(10000),
...         'c': range(10000),
...     })
... )
>>>
>>> # Check
>>> print(get_dims(df1))
>>> print(get_dims(df1))

Terminal

{"rows": "5000", "cols": "2"}

Terminal

{"rows": "10000", "cols": "3"}

Basic usage
>>> print(get_dims_of_tables(['df1', 'df2']))

  table type  rows cols
0   df1      5,000    2
1   df2      1,000    3

No commas
>>> print(get_dims_of_tables(['df1', 'df2'], use_commas=False))

  table type rows cols
0   df1      5000    2
1   df2      1000    3

Missing DF
>>> display(get_dims_of_tables(['df1', 'df2', 'df3'], use_comma=False))

  table type rows cols
0   df1      5000    2
1   df2      1000    3
1   df3       NaN  NaN

Notes

The first column of the returned table is the name of the table from the scope provided.
The second column of the returned table is the type of the table. That is, whether the table is one of ["prd", "arc", "acm"], which are for 'production', 'archive', accumulation' categories. This is designated by the table containing an underscore (_), and having a suffic of either one of: "prd", "arc", or "acm". If the table does not contain this info, then the value in this second column will just be blank.
If one of the tables given in the tables list does not exist in the scope, then the values given in the rows and cols columns will either be the values: np.nan or "Did not load".

Source code in src/toolbox_pyspark/dimensions.py

@typechecked
def get_dims_of_tables(
    tables: str_list,
    scope: Optional[dict] = None,
    use_comma: bool = True,
) -> pdDataFrame:
    """
    !!! note "Summary"
        Take in a list of the names of some tables, and for each of them, check their dimensions.

    ???+ abstract "Details"
        This function will check against the `#!py global()` scope. So you need to be careful if you're dealing with massive amounts of data in memory.

    Params:
        tables (str_list):
            The list of the tables that will be checked.
        scope (dict, optional):
            This is the scope against which the tables will be checked.<br>
            If `#!py None`, then it will use the `#!py global()` scope by default..<br>
            Defaults to `#!py None`.
        use_comma (bool, optional):
            Whether or not the dimensions from the tables should be formatted as a string with a comma as the thousandths delimiter.<br>
            Defaults to `#!py True`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.

    Returns:
        (pdDataFrame):
            A `pandas` `dataframe` with four columns: `#!py ["table", "type", "rows", "cols"]`.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>> from toolbox_pyspark.dimensions import get_dims_of_tables, get_dims
        >>>
        >>> # Instantiate Spark
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df1 = spark.createDataFrame(
        ...     pd.DataFrame({
        ...         'a': range(5000),
        ...         'b': range(5000),
        ...     })
        ... )
        >>> df2 = spark.createDataFrame(
        ...     pd.DataFrame({
        ...         'a': range(10000),
        ...         'b': range(10000),
        ...         'c': range(10000),
        ...     })
        ... )
        >>>
        >>> # Check
        >>> print(get_dims(df1))
        >>> print(get_dims(df1))
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        {"rows": "5000", "cols": "2"}
        ```
        ```{.txt .text title="Terminal"}
        {"rows": "10000", "cols": "3"}
        ```
        </div>

        ```{.py .python linenums="1" title="Basic usage"}
        >>> print(get_dims_of_tables(['df1', 'df2']))
        ```
        <div class="result" markdown>
        ```{.txt .text}
          table type  rows cols
        0   df1      5,000    2
        1   df2      1,000    3
        ```
        </div>

        ```{.py .python linenums="1" title="No commas"}
        >>> print(get_dims_of_tables(['df1', 'df2'], use_commas=False))
        ```
        <div class="result" markdown>
        ```{.txt .text}
          table type rows cols
        0   df1      5000    2
        1   df2      1000    3
        ```
        </div>

        ```{.py .python linenums="1" title="Missing DF"}
        >>> display(get_dims_of_tables(['df1', 'df2', 'df3'], use_comma=False))
        ```
        <div class="result" markdown>
        ```{.txt .text}
          table type rows cols
        0   df1      5000    2
        1   df2      1000    3
        1   df3       NaN  NaN
        ```
        </div>

    ??? info "Notes"
        - The first column of the returned table is the name of the table from the `scope` provided.
        - The second column of the returned table is the `type` of the table. That is, whether the table is one of `#!py ["prd", "arc", "acm"]`, which are for 'production', 'archive', accumulation' categories. This is designated by the table containing an underscore (`_`), and having a suffic of either one of: `#!py "prd"`, `#!py "arc"`, or `#!py "acm"`. If the table does not contain this info, then the value in this second column will just be blank.
        - If one of the tables given in the `tables` list does not exist in the `scope`, then the values given in the `rows` and `cols` columns will either be the values: `#!py np.nan` or `#!py "Did not load"`.
    """
    sizes: Dict[str, list] = {
        "table": list(),
        "type": list(),
        "rows": list(),
        "cols": list(),
    }
    rows: Union[str, int, float]
    cols: Union[str, int, float]
    for tbl, typ in [
        (
            table.rsplit("_", 1)
            if "_" in table and table.endswith(("acm", "arc", "prd"))
            else (table, "")
        )
        for table in tables
    ]:
        try:
            tmp: psDataFrame = eval(
                f"{tbl}{f'_{typ}' if typ!='' else ''}",
                globals() if scope is None else scope,
            )
            rows, cols = get_dims(tmp, use_names=False, use_comma=use_comma)
        except Exception:
            if use_comma:
                rows = cols = "Did not load"
            else:
                rows = cols = np.nan
        sizes["table"].append(tbl)
        sizes["type"].append(typ)
        sizes["rows"].append(rows)
        sizes["cols"].append(cols)
    return pdDataFrame(sizes)

make_dimension_table 🔗

make_dimension_table(
    dataframe: psDataFrame,
    columns: Union[str, str_collection],
    index_prefix: str = "id",
) -> psDataFrame

Summary

Create a dimension table from the specified columns of a given pyspark dataframe.

Details

This function will create a dimension table from the specified columns of a given pyspark dataframe. The dimension table will contain the unique values of the specified columns, along with an index column that will be used to replace the original columns in the original dataframe.

index column will be named according to the index_prefix parameter. If only one column is specified, then the index column will be named according to the index_prefix parameter followed by the name of the column. If multiple columns are specified, then the index column will be named according to the index_prefix parameter only. The index column will be created by using the row_number() window function over the specified columns.

The dimension table will be created by selecting the specified columns from the original dataframe, then applying the distinct() function to get the unique values, and finally applying the row_number() window function to create the index column.

Parameters:

Name	Type	Description	Default
`dataframe`	`DataFrame`	The DataFrame to create the dimension table from.	required
`columns`	`Union[str, str_collection]`	The column(s) to include in the dimension table.	required
`index_prefix`	`str`	The prefix to use for the index column. Defaults to `"id"`.	`'id'`

Raises:

Type	Description
`TypeError`	If any of the inputs parsed to the parameters of this function are not the correct type. Uses the `@typeguard.typechecked` decorator.
`ColumnDoesNotExistError`	If any of the columns specified do not exist in the dataframe.

Returns:

Type	Description
`DataFrame`	The dimension table.

Examples

Set up
>>> # Imports
>>> import pandas as pd
>>> from pyspark.sql import SparkSession
>>> from toolbox_pyspark.dimensions import make_dimension_table
>>>
>>> # Instantiate Spark
>>> spark = SparkSession.builder.getOrCreate()
>>>
>>> # Create data
>>> df = spark.createDataFrame(
...     pd.DataFrame(
...         {
...             "a": [1, 2, 3, 4],
...             "b": ["a", "b", "c", "d"],
...             "c": [1, 1, 2, 2],
...             "d": ["a", "b", "b", "b"],
...             "e": ["x", "x", "y", "z"],
...         }
...     )
... )
>>>
>>> # Check
>>> df.show()

Terminal

+---+---+---+---+---+
| a | b | c | d | e |
+---+---+---+---+---+
| 1 | a | 1 | a | x |
| 2 | b | 1 | b | x |
| 3 | c | 2 | b | y |
| 4 | d | 2 | b | z |
+---+---+---+---+---+

Example 1: Create dimension table with single column
>>> dim_table = make_dimension_table(df, "d")
>>> dim_table.show()

Terminal

+------+---+
| id_d | d |
+------+---+
|    1 | a |
|    2 | b |
+------+---+

Conclusion: Successfully created dimension table with single column.

Example 2: Create dimension table with multiple columns
>>> dim_table = make_dimension_table(df, ["c", "d"])
>>> dim_table.show()

Terminal

+----+---+---+
| id | c | d |
+----+---+---+
|  1 | 1 | a |
|  2 | 1 | b |
|  3 | 2 | b |
+----+---+---+

Conclusion: Successfully created dimension table with multiple columns.

Example 3: Use different prefix
>>> dim_table = make_dimension_table(df, "e", "index")
>>> dim_table.show()

Terminal

+---------+---+
| index_e | e |
+---------+---+
|       1 | x |
|       2 | y |
|       3 | z |
+---------+---+

Conclusion: Successfully created dimension table with different prefix.

Example 4: Invalid column
>>> dim_table = make_dimension_table(df, "123")

Terminal

ColumnDoesNotExistError: Column '123' does not exist in the DataFrame.

Conclusion: Failed to create dimension table due to invalid column name.

replace_columns_with_dimension_id 🔗

replace_columns_with_dimension_id(
    fct_dataframe: psDataFrame,
    dim_dataframe: psDataFrame,
    cols_to_replace: Union[str, str_collection],
    dim_id_col: Optional[str] = None,
) -> psDataFrame

Summary

Replace the specified columns in a given pyspark dataframe with the corresponding dimension table IDs.

Details

This function will replace the specified columns in a given pyspark dataframe with the corresponding dimension table IDs. The dimension table IDs will be obtained by joining the dimension table with the original dataframe on the specified columns. The original columns will then be dropped from the original dataframe.

The dimension table IDs will be added to the original dataframe to replace the columns specified in cols_to_replace. The dimension table IDs will be obtained by joining the dimension table with the original dataframe on the specified columns.

The join will be performed using a left join, so that any rows in the original dataframe that do not have a corresponding row in the dimension table will have a null value for the dimension table ID. The original columns will be dropped from the original dataframe after the join. The resulting dataframe will have the same number of rows as the original dataframe, but with the specified columns replaced by the dimension table IDs.

Parameters:

Name	Type	Description	Default
`fct_dataframe`	`DataFrame`	The DataFrame to replace the columns in.	required
`dim_dataframe`	`DataFrame`	The dimension table containing the IDs.	required
`cols_to_replace`	`Union[str, str_collection]`	The column(s) to replace with the dimension table IDs.	required
`dim_id_col`	`str`	The name of the column in the dimension table containing the IDs. If `None`, then will use the first column of the dimension table. Defaults to `None`.	`None`

Raises:

Type	Description
`TypeError`	If any of the inputs parsed to the parameters of this function are not the correct type. Uses the `@typeguard.typechecked` decorator.
`ColumnDoesNotExistError`	If any of the columns specified do not exist in the dataframes.

Returns:

Type	Description
`DataFrame`	The DataFrame with the columns replaced by the dimension table IDs.

Examples

Set up
>>> # Imports
>>> import pandas as pd
>>> from pyspark.sql import SparkSession
>>>
>>> # Instantiate Spark
>>> from toolbox_pyspark.dimensions import make_dimension_table, replace_columns_with_dimension_id
>>> spark = SparkSession.builder.getOrCreate()
>>>
>>> # Create data
>>> df = spark.createDataFrame(
...     pd.DataFrame(
...         {
...             "a": [1, 2, 3, 4],
...             "b": ["a", "b", "c", "d"],
...             "c": [1, 1, 2, 2],
...             "d": ["a", "b", "b", "b"],
...             "e": ["x", "x", "y", "z"],
...         }
...     )
... )
>>> dim_table1 = make_dimension_table(df, "d")
>>> dim_table2 = make_dimension_table(df, "e")
>>> dim_table3 = make_dimension_table(df, ("c", "d"))
>>>
>>> # Check
>>> df.show()
>>> dim_table1.show()
>>> dim_table2.show()
>>> dim_table3.show()

Terminal

+---+---+---+---+---+
| a | b | c | d | e |
+---+---+---+---+---+
| 1 | a | 1 | a | x |
| 2 | b | 1 | b | x |
| 3 | c | 2 | b | y |
| 4 | d | 2 | b | z |
+---+---+---+---+---+

Terminal

+------+---+
| id_d | d |
+------+---+
|    1 | a |
|    2 | b |
+------+---+

Terminal

+------+---+
| id_e | e |
+------+---+
|    1 | x |
|    2 | y |
|    3 | z |
+------+---+

Terminal

+----+---+---+
| id | c | d |
+----+---+---+
|  1 | 1 | a |
|  2 | 1 | b |
|  3 | 2 | b |
+----+---+---+

Example 1: Replace single column with dimension ID
>>> df_replaced = replace_columns_with_dimension_id(df, dim_table1, "d")
>>> df_replaced.show()

Terminal

+---+---+---+------+---+
| a | b | c | id_d | e |
+---+---+---+------+---+
| 1 | a | 1 |    1 | x |
| 2 | b | 1 |    2 | x |
| 3 | c | 2 |    2 | y |
| 4 | d | 2 |    2 | z |
+---+---+---+------+---+

Conclusion: Successfully replaced single column with dimension ID.

Example 2: Replace single column with dimension ID
>>> df_replaced = replace_columns_with_dimension_id(df, dim_table2, "e")
>>> df_replaced.show()

Terminal

+---+---+---+---+------+
| a | b | c | d | id_e |
+---+---+---+---+------+
| 1 | a | 1 | a |    1 |
| 2 | b | 1 | b |    1 |
| 3 | c | 2 | b |    2 |
| 4 | d | 2 | b |    3 |
+---+---+---+---+------+

Conclusion: Successfully replaced single column with dimension ID.

Example 3: Replace multiple columns with dimension IDs
>>> df_replaced_multi = replace_columns_with_dimension_id(df, dim_table3, ["c", "d"])
>>> df_replaced_multi.show()

Terminal

+---+---+----+---+
| a | b | id | e |
+---+---+----+---+
| 1 | a |  1 | x |
| 2 | b |  2 | x |
| 3 | c |  3 | y |
| 4 | d |  3 | z |
+---+---+----+---+

Conclusion: Successfully replaced multiple columns with dimension IDs.

Example 4: Invalid column type
>>> df_replaced = replace_columns_with_dimension_id(df, dim_table, "123")

Terminal

ColumnDoesNotExistError: Column '123' does not exist in the DataFrame.

Conclusion: Failed to replace columns due to invalid column type.

See Also

make_dimension_table

Source code in src/toolbox_pyspark/dimensions.py

@typechecked
def replace_columns_with_dimension_id(
    fct_dataframe: psDataFrame,
    dim_dataframe: psDataFrame,
    cols_to_replace: Union[str, str_collection],
    dim_id_col: Optional[str] = None,
) -> psDataFrame:
    """
    !!! note "Summary"
        Replace the specified columns in a given `pyspark` dataframe with the corresponding dimension table IDs.

    ???+ abstract "Details"
        This function will replace the specified columns in a given `pyspark` dataframe with the corresponding dimension table IDs. The dimension table IDs will be obtained by joining the dimension table with the original dataframe on the specified columns. The original columns will then be dropped from the original dataframe.

        The dimension table IDs will be added to the original dataframe to replace the columns specified in `cols_to_replace`. The dimension table IDs will be obtained by joining the dimension table with the original dataframe on the specified columns.

        The join will be performed using a left join, so that any rows in the original dataframe that do not have a corresponding row in the dimension table will have a `#!sql null` value for the dimension table ID. The original columns will be dropped from the original dataframe after the join.         The resulting dataframe will have the same number of rows as the original dataframe, but with the specified columns replaced by the dimension table IDs.

    Params:
        fct_dataframe (psDataFrame):
            The DataFrame to replace the columns in.
        dim_dataframe (psDataFrame):
            The dimension table containing the IDs.
        cols_to_replace (Union[str, str_collection]):
            The column(s) to replace with the dimension table IDs.
        dim_id_col (str, optional):
            The name of the column in the dimension table containing the IDs.<br>
            If `#!py None`, then will use the first column of the dimension table.<br>
            Defaults to `#!py None`.

    Raises:
        TypeError:
            If any of the inputs parsed to the parameters of this function are not the correct type. Uses the [`@typeguard.typechecked`](https://typeguard.readthedocs.io/en/stable/api.html#typeguard.typechecked) decorator.
        ColumnDoesNotExistError:
            If any of the columns specified do not exist in the dataframes.

    Returns:
        (psDataFrame):
            The DataFrame with the columns replaced by the dimension table IDs.

    ???+ example "Examples"

        ```{.py .python linenums="1" title="Set up"}
        >>> # Imports
        >>> import pandas as pd
        >>> from pyspark.sql import SparkSession
        >>>
        >>> # Instantiate Spark
        >>> from toolbox_pyspark.dimensions import make_dimension_table, replace_columns_with_dimension_id
        >>> spark = SparkSession.builder.getOrCreate()
        >>>
        >>> # Create data
        >>> df = spark.createDataFrame(
        ...     pd.DataFrame(
        ...         {
        ...             "a": [1, 2, 3, 4],
        ...             "b": ["a", "b", "c", "d"],
        ...             "c": [1, 1, 2, 2],
        ...             "d": ["a", "b", "b", "b"],
        ...             "e": ["x", "x", "y", "z"],
        ...         }
        ...     )
        ... )
        >>> dim_table1 = make_dimension_table(df, "d")
        >>> dim_table2 = make_dimension_table(df, "e")
        >>> dim_table3 = make_dimension_table(df, ("c", "d"))
        >>>
        >>> # Check
        >>> df.show()
        >>> dim_table1.show()
        >>> dim_table2.show()
        >>> dim_table3.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---+---+---+
        | a | b | c | d | e |
        +---+---+---+---+---+
        | 1 | a | 1 | a | x |
        | 2 | b | 1 | b | x |
        | 3 | c | 2 | b | y |
        | 4 | d | 2 | b | z |
        +---+---+---+---+---+
        ```
        ```{.txt .text title="Terminal"}
        +------+---+
        | id_d | d |
        +------+---+
        |    1 | a |
        |    2 | b |
        +------+---+
        ```
        ```{.txt .text title="Terminal"}
        +------+---+
        | id_e | e |
        +------+---+
        |    1 | x |
        |    2 | y |
        |    3 | z |
        +------+---+
        ```
        ```{.txt .text title="Terminal"}
        +----+---+---+
        | id | c | d |
        +----+---+---+
        |  1 | 1 | a |
        |  2 | 1 | b |
        |  3 | 2 | b |
        +----+---+---+
        ```
        </div>

        ```{.py .python linenums="1" title="Example 1: Replace single column with dimension ID"}
        >>> df_replaced = replace_columns_with_dimension_id(df, dim_table1, "d")
        >>> df_replaced.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---+------+---+
        | a | b | c | id_d | e |
        +---+---+---+------+---+
        | 1 | a | 1 |    1 | x |
        | 2 | b | 1 |    2 | x |
        | 3 | c | 2 |    2 | y |
        | 4 | d | 2 |    2 | z |
        +---+---+---+------+---+
        ```
        !!! success "Conclusion: Successfully replaced single column with dimension ID."
        </div>

        ```{.py .python linenums="1" title="Example 2: Replace single column with dimension ID"}
        >>> df_replaced = replace_columns_with_dimension_id(df, dim_table2, "e")
        >>> df_replaced.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+---+---+------+
        | a | b | c | d | id_e |
        +---+---+---+---+------+
        | 1 | a | 1 | a |    1 |
        | 2 | b | 1 | b |    1 |
        | 3 | c | 2 | b |    2 |
        | 4 | d | 2 | b |    3 |
        +---+---+---+---+------+
        ```
        !!! success "Conclusion: Successfully replaced single column with dimension ID."
        </div>

        ```{.py .python linenums="1" title="Example 3: Replace multiple columns with dimension IDs"}
        >>> df_replaced_multi = replace_columns_with_dimension_id(df, dim_table3, ["c", "d"])
        >>> df_replaced_multi.show()
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        +---+---+----+---+
        | a | b | id | e |
        +---+---+----+---+
        | 1 | a |  1 | x |
        | 2 | b |  2 | x |
        | 3 | c |  3 | y |
        | 4 | d |  3 | z |
        +---+---+----+---+
        ```
        !!! success "Conclusion: Successfully replaced multiple columns with dimension IDs."
        </div>

        ```{.py .python linenums="1" title="Example 4: Invalid column type"}
        >>> df_replaced = replace_columns_with_dimension_id(df, dim_table, "123")
        ```
        <div class="result" markdown>
        ```{.txt .text title="Terminal"}
        ColumnDoesNotExistError: Column '123' does not exist in the DataFrame.
        ```
        !!! failure "Conclusion: Failed to replace columns due to invalid column type."
        </div>

    ??? tip "See Also"
        - [`make_dimension_table`][toolbox_pyspark.dimensions.make_dimension_table]
    """

    # Generate variables ----
    cols_to_replace: str_list = (
        [cols_to_replace] if is_type(cols_to_replace, str) else list(cols_to_replace)
    )
    fct_cols: str_list = fct_dataframe.columns
    dim_cols: str_list = dim_dataframe.columns
    dim_id_col = dim_id_col or dim_cols[0]

    # Check variables ----
    assert_columns_exists(fct_dataframe, cols_to_replace)
    assert_columns_exists(dim_dataframe, cols_to_replace)
    assert_column_exists(dim_dataframe, dim_id_col)

    # Perform the replacement ----
    index_of_first_col: int = fct_cols.index(cols_to_replace[0])
    fct_new_cols: str_list = deepcopy(fct_cols)
    fct_new_cols = [
        *fct_new_cols[:index_of_first_col],
        dim_id_col,
        *fct_new_cols[index_of_first_col + 1 :],
    ]
    fct_removed_cols: str_list = [col for col in fct_new_cols if col not in cols_to_replace]

    # Return ----
    return (
        fct_dataframe.alias("a")
        .join(
            other=dim_dataframe.alias("b"),
            on=[F.col(f"a.{col}") == F.col(f"b.{col}") for col in cols_to_replace],
            how="left",
        )
        .select("a.*", f"b.{dim_id_col}")
        .select(*fct_removed_cols)
    )