spark2_dfanalysis

Dataframe analysis in PySpark

View on GitHub

<!DOCTYPE html>

dev_basic_ops_2dataframes_Join

join_2dataframes

In [1]:
%load_ext autoreload
%autoreload 2

# The autoreload extension is already loaded. To reload it, use:
#  %reload_ext autoreload
In [2]:
import os
import sys
import numpy as np
import pandas as pd

pd.options.display.float_format = '{:8,.2f}'.format
In [3]:
# mylib:
my_library = os.path.expanduser('~/.myconfigs')
my_spark = os.path.expanduser('~/spark2_dfanalysis')
sys.path.append(my_library)
sys.path.append(my_spark)
In [4]:
import pyspark as spark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# spark = SparkSession.builder.appName('myappname').getOrCreate()
# print(spark)
In [5]:
# print(dir(pyspark))
# print(dir(pyspark.sql))
# print(dir(pyspark.rdd))
In [6]:
from shared.app_context import *

ctx = ApplicationContext("Dev-Job")
print(ctx.spark)
<pyspark.sql.session.SparkSession object at 0x000002000E2098D0>
In [7]:
# print(sys.path)
from builder.DataFrameBuild import *

DataFrameBuild Class:

x = DataFrameBuild(ctx.spark)

In [8]:
x = DataFrameBuild(ctx.spark)

DF1

In [22]:
x = DataFrameBuild(ctx.spark)
num = 500
df1 = x.arrays_to_dataframe([[int(x) for x in np.linspace(1,num,num)],
                             x.build_array("string",num=num,width=8),
                             x.build_array("integer",num=num,nrange=(1,4)),
                             x.build_array("integer",num=num,nrange=(1,12)),
                             x.build_array("double",num=num,nrange=(0.0,10000))],
                            ['index','passwords','quarter','month','price'])

DF2

In [23]:
x = DataFrameBuild(ctx.spark)
num = 500
df2 = x.arrays_to_dataframe([[int(x) for x in np.linspace(1,num,num)],
                             x.build_array("string",num=num,width=8),
                             x.build_array("integer",num=num,nrange=(1,4)),
                             x.build_array("integer",num=num,nrange=(1,12)),
                             x.build_array("double",num=num,nrange=(0.0,10000))],
                             ['index','passwords','quarter','month','price'])
In [24]:
df1.limit(10).show()
+-----+---------+-------+-----+------------------+
|index|passwords|quarter|month|             price|
+-----+---------+-------+-----+------------------+
|    1| xrugxzqt|      4|    2| 9169.325130652613|
|    2| xfkibuuk|      1|    1| 1257.733583089643|
|    3| rcbcqdio|      3|   12|2961.8063111136207|
|    4| kbntpqtr|      3|    9| 4902.258515296808|
|    5| jyyapygl|      3|   12| 334.3144742334825|
|    6| ilpeslcl|      4|    2| 4019.697755986953|
|    7| mpiosdaq|      4|    2| 1251.444157602367|
|    8| ozrbrshu|      2|    2| 4113.287043919263|
|    9| lavynxah|      2|    6| 6334.725063271436|
|   10| obovdjxc|      2|    1| 3850.804965001796|
+-----+---------+-------+-----+------------------+

In [25]:
df2.limit(10).show()
+-----+---------+-------+-----+------------------+
|index|passwords|quarter|month|             price|
+-----+---------+-------+-----+------------------+
|    1| uzsozolw|      2|    6|2590.2398235673827|
|    2| ioieiepd|      2|    9| 4826.274269733417|
|    3| hyyoddgf|      4|    3| 3448.071954770697|
|    4| ypuftrjh|      1|   12| 622.5235948406471|
|    5| dxrjfwbo|      4|    3| 4907.760745104725|
|    6| aptqjlhl|      4|    5|  5014.79047899606|
|    7| pbyvsthp|      4|    4|1156.9331701466456|
|    8| asxgfiit|      2|    7|1024.9884466386773|
|    9| krymouph|      2|    5| 3369.522880393101|
|   10| vzfygdlg|      4|    2| 7579.167357164106|
+-----+---------+-------+-----+------------------+

Join df1 and df2 on indices.

In [60]:
df_index = df1.join(df2,df1.index == df2.index)
df_index = df1.join(df2,df1["index"] == df2["index"])
In [61]:
%%time
df_index.cache()
Wall time: 2 ms
Out[61]:
DataFrame[index: bigint, passwords: string, quarter: bigint, month: bigint, price: double, index: bigint, passwords: string, quarter: bigint, month: bigint, price: double]
In [62]:
df_index.show(20)
+-----+---------+-------+-----+------------------+-----+---------+-------+-----+------------------+
|index|passwords|quarter|month|             price|index|passwords|quarter|month|             price|
+-----+---------+-------+-----+------------------+-----+---------+-------+-----+------------------+
|   26| qerglufq|      2|    1|3030.4061156760918|   26| wxgpatxj|      1|    2|3387.6597958352263|
|   29| aigyphwn|      1|   10|  6252.80216631731|   29| oiemkyzy|      2|    1|4023.8255056872863|
|  474| udgmapre|      2|    4| 9749.675099684515|  474| rjowkrqh|      4|    9| 6632.458760756636|
|   65| bhlxckok|      3|    9| 7814.899323398169|   65| oygsbzid|      2|    3|3433.3381516495733|
|  191| szbxvtnz|      4|    9| 9491.417542858147|  191| osvytxei|      2|    9|1161.2357048689792|
|  418| pnpfiwrk|      3|   10| 7230.036629785022|  418| vaumhtcy|      4|    8|  36.0142648019568|
|  222| ccymeaxh|      1|    9|  5908.93062973059|  222| gcomxhhv|      3|    5| 8488.318332444042|
|  270| ddfjfxzt|      4|    3| 5970.621724729035|  270| skbjxrog|      1|    3| 7568.249534754637|
|  293| jknaddua|      2|    7| 5566.910796298438|  293| njekvwmh|      4|    8| 2770.976587685471|
|  243| rfrahdoj|      4|    3| 2508.249351565942|  243| zchczlnb|      2|    8| 4210.342994686196|
|  278| spcrvwgi|      4|    2|  1374.59222039512|  278| rfzsjvja|      4|    3| 5856.952043542598|
|  367| jxclrdyx|      2|    6| 8226.681964452644|  367| rkizrmwc|      3|   10| 7406.103465920653|
|  442| rwwuglhq|      2|   11| 7050.418869549039|  442| yxhhwqmg|      4|    1| 83.69776464811184|
|   19| cpldjaiv|      1|    2| 8293.457027686814|   19| mzahrhba|      4|    1| 9884.640549816078|
|   54| kfcplqyn|      3|    9| 9558.520838842534|   54| derokvpy|      2|   11| 9945.487081450286|
|  296| jdzelnus|      2|    4| 63.80729882147884|  296| okpsltwp|      3|   10| 4948.580670919397|
|  277| lqgljcsv|      2|    1|1694.3824389919105|  277| nufsfxdh|      3|    6| 5334.676839178573|
|  287| tsirclwb|      1|   10| 883.9196795238624|  287| mjojrjhk|      4|    8| 4094.679253981852|
|  348| iwmorpau|      3|   11| 8844.075521039702|  348| zgmfvxmo|      3|    8| 6478.862381798661|
|  415| wikukapk|      4|    8| 4738.217283211133|  415| gfswfbwz|      2|   10| 4116.937947551068|
+-----+---------+-------+-----+------------------+-----+---------+-------+-----+------------------+
only showing top 20 rows

In [58]:
df_index.columns
Out[58]:
['index',
 'passwords',
 'quarter',
 'month',
 'price',
 'index',
 'passwords',
 'quarter',
 'month',
 'price']

Join df1 and df2 on indices. (round 2)

In [71]:
df_i2 = df1.alias("o")\
.join(df2.alias("t"),col("o.index") == col("t.index"))

# df_index = df1.join(df2,df1["index"] == df2["index"])
In [72]:
%%time
df_i2.cache()
Wall time: 0 ns
Out[72]:
DataFrame[index: bigint, passwords: string, quarter: bigint, month: bigint, price: double, index: bigint, passwords: string, quarter: bigint, month: bigint, price: double]
In [73]:
df_i2.show(20)
+-----+---------+-------+-----+------------------+-----+---------+-------+-----+------------------+
|index|passwords|quarter|month|             price|index|passwords|quarter|month|             price|
+-----+---------+-------+-----+------------------+-----+---------+-------+-----+------------------+
|   26| qerglufq|      2|    1|3030.4061156760918|   26| wxgpatxj|      1|    2|3387.6597958352263|
|   29| aigyphwn|      1|   10|  6252.80216631731|   29| oiemkyzy|      2|    1|4023.8255056872863|
|  474| udgmapre|      2|    4| 9749.675099684515|  474| rjowkrqh|      4|    9| 6632.458760756636|
|   65| bhlxckok|      3|    9| 7814.899323398169|   65| oygsbzid|      2|    3|3433.3381516495733|
|  191| szbxvtnz|      4|    9| 9491.417542858147|  191| osvytxei|      2|    9|1161.2357048689792|
|  418| pnpfiwrk|      3|   10| 7230.036629785022|  418| vaumhtcy|      4|    8|  36.0142648019568|
|  222| ccymeaxh|      1|    9|  5908.93062973059|  222| gcomxhhv|      3|    5| 8488.318332444042|
|  270| ddfjfxzt|      4|    3| 5970.621724729035|  270| skbjxrog|      1|    3| 7568.249534754637|
|  293| jknaddua|      2|    7| 5566.910796298438|  293| njekvwmh|      4|    8| 2770.976587685471|
|  243| rfrahdoj|      4|    3| 2508.249351565942|  243| zchczlnb|      2|    8| 4210.342994686196|
|  278| spcrvwgi|      4|    2|  1374.59222039512|  278| rfzsjvja|      4|    3| 5856.952043542598|
|  367| jxclrdyx|      2|    6| 8226.681964452644|  367| rkizrmwc|      3|   10| 7406.103465920653|
|  442| rwwuglhq|      2|   11| 7050.418869549039|  442| yxhhwqmg|      4|    1| 83.69776464811184|
|   19| cpldjaiv|      1|    2| 8293.457027686814|   19| mzahrhba|      4|    1| 9884.640549816078|
|   54| kfcplqyn|      3|    9| 9558.520838842534|   54| derokvpy|      2|   11| 9945.487081450286|
|  296| jdzelnus|      2|    4| 63.80729882147884|  296| okpsltwp|      3|   10| 4948.580670919397|
|  277| lqgljcsv|      2|    1|1694.3824389919105|  277| nufsfxdh|      3|    6| 5334.676839178573|
|  287| tsirclwb|      1|   10| 883.9196795238624|  287| mjojrjhk|      4|    8| 4094.679253981852|
|  348| iwmorpau|      3|   11| 8844.075521039702|  348| zgmfvxmo|      3|    8| 6478.862381798661|
|  415| wikukapk|      4|    8| 4738.217283211133|  415| gfswfbwz|      2|   10| 4116.937947551068|
+-----+---------+-------+-----+------------------+-----+---------+-------+-----+------------------+
only showing top 20 rows

In [74]:
df_i2.columns
Out[74]:
['index',
 'passwords',
 'quarter',
 'month',
 'price',
 'index',
 'passwords',
 'quarter',
 'month',
 'price']

Create a new column calculated from current column values.

In [ ]: