Assignment7.pdf

D

data science

Assignment 7.1.a
pyarrow.Table

airline: struct<airline_id: int64, name: string, alias: string, iata: string, icao:
string, callsign: string, country: string, active: bool>

child 0, airline_id: int64

child 1, name: string

child 2, alias: string

child 3, iata: string

In [93]: import os

import json

from pathlib import Path

import gzip

import hashlib

import shutil

import pandas as pd

import pygeohash

import s3fs



# S3 endpoint handed to s3fs whenever the source data is opened.
endpoint_url = 'https://storage.budsc.midwest-datascience.com'

# Everything this notebook writes goes beneath <current working dir>/results.
current_dir = Path(os.getcwd()).absolute()
results_dir = current_dir.joinpath('results')

# Start every run from an empty results directory.
# WARNING: this removes any existing ./results tree, including earlier output.
if results_dir.exists():
    shutil.rmtree(results_dir)
results_dir.mkdir(parents=True, exist_ok=True)



def read_jsonl_data():
    """Fetch the gzipped routes file and decode it as JSON Lines.

    The object ``data/processed/openflights/routes.jsonl.gz`` is opened
    anonymously through ``s3fs`` against the module-level ``endpoint_url``
    and decompressed while it is read.

    Returns:
        list: one decoded JSON value per non-blank line, in file order.
    """
    s3 = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={'endpoint_url': endpoint_url},
    )
    src_data_path = 'data/processed/openflights/routes.jsonl.gz'
    with s3.open(src_data_path, 'rb') as f_gz:
        with gzip.open(f_gz, 'rb') as f:
            # Iterate the stream instead of readlines() so the raw lines are
            # not all held in memory next to the parsed records; blank lines
            # (e.g. a trailing newline) are skipped rather than crashing
            # json.loads.
            records = [json.loads(line) for line in f if line.strip()]
    return records

In [94]: from pyarrow.json import read_json

import pyarrow.parquet as pq

def create_parquet_dataset():
    """Convert the gzipped routes JSON Lines file into ``results/routes.parquet``.

    The source object is opened anonymously through ``s3fs`` against the
    module-level ``endpoint_url``, decompressed, parsed into an Arrow table
    with ``pyarrow.json.read_json``, printed, and written uncompressed to
    ``results_dir / 'routes.parquet'``.

    Returns:
        None
    """
    # Imported locally under its own name: a later cell rebinds the
    # notebook-level name `pq` to a DataFrame, which would make a second call
    # of this function fail with AttributeError on `pq.write_table`.
    import pyarrow.parquet as parquet

    src_data_path = 'data/processed/openflights/routes.jsonl.gz'
    parquet_output_path = results_dir.joinpath('routes.parquet')
    s3 = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={'endpoint_url': endpoint_url},
    )

    with s3.open(src_data_path, 'rb') as f_gz:
        with gzip.open(f_gz, 'rb') as f:
            # read_json consumes the whole stream, so the table is complete
            # before the files are closed.
            table = read_json(f)

    print(table)
    parquet.write_table(table, parquet_output_path, compression='none')


create_parquet_dataset()
child 4, icao: string

child 5, callsign: string

child 6, country: string

child 7, active: bool

src_airport: struct<airport_id: int64, name: string, city: string, country: string,
iata: string, icao: string, latitude: double, longitude: double, altitude: int64, ti
mezone: double, dst: string, tz_id: string, type: string, source: string>

child 0, airport_id: int64

child 1, name: string

child 2, city: string

child 3, country: string

child 4, iata: string

child 5, icao: string

child 6, latitude: double

child 7, longitude: double

child 8, altitude: int64

child 9, timezone: double

child 10, dst: string

child 11, tz_id: string

child 12, type: string

child 13, source: string

dst_airport: struct<airport_id: int64, name: string, city: string, country: string,
iata: string, icao: string, latitude: double, longitude: double, altitude: int64, ti
mezone: double, dst: string, tz_id: string, type: string, source: string>

child 0, airport_id: int64

child 1, name: string

child 2, city: string

child 3, country: string

child 4, iata: string

child 5, icao: string

child 6, latitude: double

child 7, longitude: double

child 8, altitude: int64

child 9, timezone: double

child 10, dst: string

child 11, tz_id: string

child 12, type: string

child 13, source: string

codeshare: bool

equipment: list<item: string>

child 0, item: string

['codeshare', 'equipment', 'airline.airline_id', 'airline.name', 'airline.alias', 'a
irline.iata', 'airline.icao', 'airline.callsign', 'airline.country', 'airline.activ
e', 'src_airport.airport_id', 'src_airport.name', 'src_airport.city', 'src_airport.c
ountry', 'src_airport.iata', 'src_airport.icao', 'src_airport.latitude', 'src_airpor
t.longitude', 'src_airport.altitude', 'src_airport.timezone', 'src_airport.dst', 'sr
c_airport.tz_id', 'src_airport.type', 'src_airport.source', 'dst_airport.airport_i
d', 'dst_airport.name', 'dst_airport.city', 'dst_airport.country', 'dst_airport.iat
a', 'dst_airport.icao', 'dst_airport.latitude', 'dst_airport.longitude', 'dst_airpor
t.altitude', 'dst_airport.timezone', 'dst_airport.dst', 'dst_airport.tz_id', 'dst_ai
rport.type', 'dst_airport.source']
In [95]: parquet_output_path = results_dir.joinpath('routes.parquet')

# NOTE(review): this rebinds `pq` -- imported by an earlier cell as the alias
# for pyarrow.parquet -- to a pandas DataFrame. Every later cell relies on
# `pq` being the DataFrame, so any later use of `pq` as the module will fail.
# The printed column list shows the nested struct fields as dotted column
# names such as 'airline.airline_id'.
pq = pd.read_parquet(parquet_output_path, engine='fastparquet')

# Show the column names of the loaded frame.
print(list(pq.columns.values))

In [96]: partitions = (

# Sixteen (first letter, last letter) pairs. A later cell zips them with
# `partitions_keys` to build the label -> pair lookup used by get_key.
('A', 'A'), ('B', 'B'), ('C', 'D'), ('E', 'F'),

('G', 'H'), ('I', 'J'), ('K', 'L'), ('M', 'M'),

('N', 'N'), ('O', 'P'), ('Q', 'R'), ('S', 'T'),

('U', 'U'), ('V', 'V'), ('W', 'X'), ('Y', 'Z')

)

In [97]: partitions_keys = (

'A', 'B', 'C-D', 'E-F',

'G-H', 'I-J', 'K-L', 'M',

'N', 'O-P', 'Q-R', 'S-T',
{'A': ('A', 'A'), 'B': ('B', 'B'), 'C-D': ('C', 'D'), 'E-F': ('E', 'F'), 'G-H':
('G', 'H'), 'I-J': ('I', 'J'), 'K-L': ('K', 'L'), 'M': ('M', 'M'), 'N': ('N', 'N'),
'O-P': ('O', 'P'), 'Q-R': ('Q', 'R'), 'S-T': ('S', 'T'), 'U': ('U', 'U'), 'V': ('V',
'V'), 'W-X': ('W', 'X'), 'Y-Z': ('Y', 'Z')}

Assignment 7.1.b
codeshare equipment airline.airline_id airline.name airline.alias airline.iata airline.icao airline.
0 0.0 [CR2] 410.0 Aerocondor
ANA All
Nippon
Airways
2B ARD AEROC
1 0.0 [CR2] 410.0 Aerocondor
ANA All
Nippon
Airways
2B ARD AEROC
2 0.0 [CR2] 410.0 Aerocondor
ANA All
Nippon
Airways
2B ARD AEROC
3 0.0 [CR2] 410.0 Aerocondor
ANA All
Nippon
Airways
2B ARD AEROC
4 0.0 [CR2] 410.0 Aerocondor
ANA All
Nippon
Airways
2B ARD AEROC
5 rows × 41 columns
'U', 'V', 'W-X', 'Y-Z'

)

In [98]: parts_k_v = dict(zip(partitions_keys, partitions))

# parts_k_v maps each partition label (e.g. 'C-D') to its letter pair
# (e.g. ('C', 'D')); printed to confirm the pairing.
print(parts_k_v)

# In [99]:
def get_key(val):
    """Return the partition label whose letter pair contains ``val``.

    The lookup uses the module-level ``parts_k_v`` mapping of
    label -> (first letter, last letter).

    Args:
        val: value to classify; the notebook passes the first character of
            the route key.

    Returns:
        str: the first matching label, or the sentinel ``"0"`` when no pair
        contains ``val``.
    """
    for key, value in parts_k_v.items():
        # Membership is tested against the two endpoints of the pair only,
        # not the letters between them. That is enough while every pair
        # spans at most two adjacent letters.
        if val in value:
            return key
    return "0"

# In [100…
# Route key: source IATA + destination IATA + airline IATA.
pq['key'] = pq['src_airport.iata'] + pq['dst_airport.iata'] + pq['airline.iata']

# The partition is decided by the first character of the key.
pq['partition_value'] = pq['key'].str[:1]

# Element-wise lookup on the single column that is needed; this avoids
# building a row object per record as DataFrame.apply(axis=1) does.
pq['kv_key'] = pq['partition_value'].map(get_key)

In [101… # remove invalid keys

# Rows that get_key labelled with the sentinel "0" are dropped.
# NOTE(review): the astype call is applied to the whole frame, so every
# column that can be cast becomes float32. The displayed output shows
# `codeshare` as 0.0 and `airline.airline_id` as 410.0 (dtype float32).
# Presumably latitude/longitude (double in the Arrow schema) are narrowed as
# well, before they are used for geohashing -- confirm this is intended.
# NOTE(review): errors='ignore' on astype is reportedly deprecated in recent
# pandas releases -- verify against the installed version.
pq = pq[pq.kv_key != "0"].astype('float32', errors='ignore')

In [102… pq.head()

Out[102…
In [104… import pyarrow as pa

import pyarrow.parquet as parpq

# Convert the filtered frame to an Arrow table and write it beneath
# results/kv as a Parquet dataset partitioned on the `kv_key` column.
pq_tab = pa.Table.from_pandas(pq)
parpq.write_to_dataset(pq_tab, root_path=results_dir.joinpath('kv'), partition_cols=['kv_key'])
Assignment 7.1.c
{'west': 'c21g6s0rs4c7', 'central': '9z7dnebnj8kb', 'east': 'dqby34cjw922'}

Assignment 7.1.d
In [105… import hashlib



def hash_key(key):
    """Return the SHA-256 hex digest of ``str(key)`` encoded as UTF-8.

    Args:
        key: any object; it is converted with ``str()`` before hashing.

    Returns:
        str: the digest as 64 lowercase hexadecimal characters.
    """
    return hashlib.sha256(str(key).encode('utf-8')).hexdigest()

# In [106…
# Route key: source IATA + destination IATA + airline IATA.
pq['key'] = pq['src_airport.iata'] + pq['dst_airport.iata'] + pq['airline.iata']

# Hash only the key column; this avoids building a row object per record as
# DataFrame.apply(axis=1) does.
pq['hashed'] = pq['key'].map(hash_key)

# The first hex character of the digest is the partition value.
pq['hash_key'] = pq['hashed'].str[:1]

In [107… pq_tab1 = pa.Table.from_pandas(pq)



# Write the frame beneath results/hash as a Parquet dataset partitioned on
# the `hash_key` column (first hex character of the SHA-256 digest).
parpq.write_to_dataset(

pq_tab1,

root_path=results_dir.joinpath('hash'),

partition_cols=['hash_key'],

)

In [109… #get hash for datacenters

# Maps a data center name to the geohash of its location; closest_datacenter
# reads this mapping.
datacenters = {}



# Arguments are (latitude, longitude). The printed result shows 12-character
# geohashes.
datacenters['west'] = pygeohash.encode(45.5945645, -121.1786823)

datacenters['central'] = pygeohash.encode(41.1544433, -96.0422378)

datacenters['east'] = pygeohash.encode(39.08344, -77.6497145)



print(datacenters)

# In [110…
def closest_datacenter(latitude, longitude):
    """Return the name of the data center nearest to a coordinate.

    The coordinate is geohashed and compared with every entry of the
    module-level ``datacenters`` mapping (name -> geohash) using
    ``pygeohash.geohash_approximate_distance``.

    Args:
        latitude: latitude of the point, passed to ``pygeohash.encode``.
        longitude: longitude of the point, passed to ``pygeohash.encode``.

    Returns:
        str: the key of ``datacenters`` with the smallest distance. On a tie
        the entry that comes first in ``datacenters`` wins. An empty string
        is returned when ``datacenters`` is empty.
    """
    geohash = pygeohash.encode(latitude, longitude)

    def distance_to(name):
        return pygeohash.geohash_approximate_distance(
            str(geohash), str(datacenters[name])
        )

    # NOTE(review): geohash_approximate_distance appears to derive distance
    # from the shared geohash prefix, so values are coarse and ties between
    # data centers are likely -- confirm whether an exact great-circle
    # distance would suit this partitioning better.
    # min() keeps the first of several equally distant candidates, which is
    # the same outcome as a strict "<" scan in dictionary order.
    return min(datacenters, key=distance_to, default='')

In [113… pq['datacenter'] = pq[['src_airport.latitude', 'src_airport.longitude']].apply(lambd
In [114… pq_tab2 = pa.Table.from_pandas(pq)



# Write the frame beneath results/geo as a Parquet dataset partitioned on
# the `datacenter` column.
parpq.write_to_dataset(

pq_tab2,

root_path=results_dir.joinpath('geo'),

partition_cols=['datacenter'],

)
0 410.0

1 410.0

2 410.0

3 410.0

4 410.0

Name: airline.airline_id, dtype: float32
[{-1.0: 1}, {10.0: 1}, {21.0: 1}, {24.0: 1}, {28.0: 1}, {29.0: 1}, {32.0: 1}, {35.0:
1}, {42.0: 1}, {43.0: 1}, {55.0: 1}, {68.0: 1}, {83.0: 1}, {90.0: 1}, {96.0: 1}, {10
6.0: 1}, {109.0: 1}, {116.0: 1}, {125.0: 1}, {130.0: 1}, {132.0: 1}, {137.0: 1}, {13
9.0: 1}, {146.0: 1}, {153.0: 1}, {179.0: 1}, {197.0: 1}, {214.0: 1}, {218.0: 1}, {22
0.0: 1}, {221.0: 1}, {225.0: 1}, {231.0: 1}, {240.0: 1}, {241.0: 1}, {242.0: 1}, {24
6.0: 1}, {312.0: 1}, {316.0: 1}, {319.0: 1}, {321.0: 1}, {324.0: 1}, {328.0: 1}, {32
9.0: 1}, {330.0: 1}, {333.0: 1}, {336.0: 1}, {338.0: 1}, {341.0: 1}, {345.0: 1}, {38
6.0: 1}, {397.0: 1}, {410.0: 1}, {412.0: 1}, {426.0: 1}, {439.0: 1}, {442.0: 1}, {46
2.0: 1}, {470.0: 1}, {476.0: 1}, {477.0: 1}, {491.0: 1}, {502.0: 1}, {503.0: 1}, {50
8.0: 1}, {515.0: 1}, {524.0: 1}, {543.0: 1}, {563.0: 1}, {567.0: 1}, {569.0: 1}, {57
6.0: 1}, {595.0: 1}, {596.0: 1}, {603.0: 1}, {608.0: 1}, {622.0: 1}, {641.0: 1}, {68
3.0: 1}, {690.0: 2}, {692.0: 2}, {751.0: 2}, {753.0: 2}, {794.0: 2}, {807.0: 2}, {83
7.0: 2}, {879.0: 2}, {881.0: 2}, {882.0: 2}, {896.0: 2}, {897.0: 2}, {921.0: 2}, {97
0.0: 2}, {995.0: 2}, {998.0: 2}, {1006.0: 2}, {1008.0: 2}, {1034.0: 2}, {1048.0: 2},
{1057.0: 2}, {1066.0: 2}, {1073.0: 2}, {1109.0: 2}, {1173.0: 2}, {1191.0: 2}, {1203.
0: 2}, {1206.0: 2}, {1230.0: 2}, {1266.0: 2}, {1287.0: 2}, {1290.0: 2}, {1299.0: 2},
{1308.0: 2}, {1316.0: 2}, {1317.0: 2}, {1338.0: 2}, {1340.0: 2}, {1355.0: 2}, {1359.
0: 2}, {1392.0: 2}, {1401.0: 2}, {1403.0: 2}, {1422.0: 2}, {1434.0: 2}, {1441.0: 2},
{1463.0: 2}, {1469.0: 2}, {1472.0: 2}, {1478.0: 2}, {1492.0: 2}, {1500.0: 2}, {1508.
0: 2}, {1531.0: 2}, {1539.0: 2}, {1548.0: 2}, {1581.0: 2}, {1611.0: 2}, {1623.0: 2},
{1629.0: 2}, {1654.0: 2}, {1663.0: 2}, {1669.0: 2}, {1680.0: 2}, {1682.0: 2}, {1683.
0: 2}, {1729.0: 2}, {1750.0: 2}, {1756.0: 2}, {1758.0: 2}, {1767.0: 2}, {1769.0: 2},
{1775.0: 2}, {1790.0: 2}, {1792.0: 2}, {1829.0: 2}, {1844.0: 2}, {1868.0: 2}, {1886.
0: 2}, {1889.0: 3}, {1908.0: 3}, {1909.0: 3}, {1925.0: 3}, {1936.0: 3}, {1942.0: 3},
{1943.0: 3}, {1946.0: 3}, {1954.0: 3}, {1966.0: 3}, {2009.0: 3}, {2056.0: 3}, {2058.
0: 3}, {2091.0: 3}, {2094.0: 3}, {2104.0: 3}, {2117.0: 3}, {2143.0: 3}, {2150.0: 3},
{2183.0: 3}, {2193.0: 3}, {2217.0: 3}, {2218.0: 3}, {2220.0: 3}, {2222.0: 3}, {2226.
0: 3}, {2245.0: 3}, {2260.0: 3}, {2264.0: 3}, {2293.0: 3}, {2297.0: 3}, {2324.0: 3},
{2350.0: 3}, {2353.0: 3}, {2354.0: 3}, {2395.0: 3}, {2409.0: 3}, {2417.0: 3}, {2418.
0: 3}, {2419.0: 3}, {2420.0: 3}, {2421.0: 3}, {2439.0: 3}, {2468.0: 3}, {2520.0: 3},
{2524.0: 3}, {2538.0: 3}, {2541.0: 3}, {2547.0: 3}, {2548.0: 3}, {2575.0: 3}, {2585.
0: 3}, {2607.0: 3}, {2622.0: 3}, {2638.0: 3}, {2660.0: 3}, {2681.0: 3}, {2682.0: 3},
{2684.0: 3}, {2688.0: 3}, {2692.0: 3}, {2731.0: 3}, {2748.0: 3}, {2750.0: 3}, {2757.
0: 3}, {2765.0: 3}, {2773.0: 3}, {2774.0: 3}, {2822.0: 3}, {2825.0: 3}, {2826.0: 3},
{2835.0: 3}, {2850.0: 3}, {2857.0: 3}, {2881.0: 3}, {2896.0: 3}, {2916.0: 3}, {2922.
0: 3}, {2923.0: 3}, {2942.0: 4}, {2951.0: 4}, {2954.0: 4}, {2987.0: 4}, {2989.0: 4},
In [128… pq['airline.airline_id'].head()

Out[128…
# In [133…
def balance_partitions(keys, num_partitions):
    """Split ``keys`` into contiguous runs whose sums are roughly equal.

    The input order is preserved: the runs are consecutive slices of ``keys``
    cut where the running total crosses each multiple of
    ``total // num_partitions``.

    Args:
        keys: sequence of numbers (list, pandas Series or NumPy array). Values
            are assumed to be non-negative so that the running total never
            decreases, which ``numpy.searchsorted`` requires.
        num_partitions: number of runs to produce; must be at least 1.

    Returns:
        list of numpy.ndarray: exactly ``num_partitions`` arrays which,
        concatenated, reproduce ``keys``. Individual runs may be empty.

    Raises:
        ValueError: if ``num_partitions`` is smaller than 1.
    """
    # Local import: no other cell of the notebook imports numpy.
    import numpy as np

    if num_partitions < 1:
        raise ValueError("num_partitions must be at least 1")

    # Accept plain lists as well as arrays (a list has no .cumsum()).
    arr = np.asarray(keys)
    if arr.size == 0:
        return [arr[:0] for _ in range(num_partitions)]

    # Running total of the keys; its last entry is the sum of the whole array.
    ac = arr.cumsum()
    partsum = ac[-1] // num_partitions

    # Target running totals at which one run ends and the next begins.
    cum_part_sums = np.arange(1, num_partitions) * partsum

    # side='right' keeps an element whose running total lands exactly on a
    # target inside the run it completes, so exact splits come out even.
    inds = np.searchsorted(ac, cum_part_sums, side='right')

    # Cut the array at those indices into approximately equal-sum runs.
    return np.split(arr, inds)

In [134… keys = list(pq['airline.airline_id'])

# `keys` holds one airline id per row of the frame, not the distinct ids.
num_partitions=7

# NOTE(review): the output recorded for this cell is a list of single-entry
# {airline_id: partition_number} dicts. That is not the shape returned by the
# balance_partitions definition shown in this notebook, so it was presumably
# produced by a different version of the function -- re-run from a fresh
# kernel to confirm.
In [135… print(balance_partitions(keys, num_partitions))
{2990.0: 4}, {2993.0: 4}, {2994.0: 4}, {3000.0: 4}, {3021.0: 4}, {3026.0: 4}, {3029.
0: 4}, {3052.0: 4}, {3081.0: 4}, {3090.0: 4}, {3097.0: 4}, {3123.0: 4}, {3126.0: 4},
{3148.0: 4}, {3163.0: 4}, {3179.0: 4}, {3197.0: 4}, {3200.0: 4}, {3201.0: 4}, {3210.
0: 4}, {3233.0: 4}, {3251.0: 4}, {3258.0: 4}, {3287.0: 4}, {3290.0: 4}, {3320.0: 4},
{3329.0: 4}, {3342.0: 4}, {3354.0: 4}, {3370.0: 4}, {3378.0: 4}, {3386.0: 4}, {3391.
0: 4}, {3392.0: 4}, {3393.0: 4}, {3432.0: 4}, {3437.0: 4}, {3463.0: 4}, {3490.0: 4},
{3498.0: 4}, {3534.0: 4}, {3539.0: 4}, {3545.0: 4}, {3547.0: 4}, {3574.0: 4}, {3589.
0: 4}, {3613.0: 4}, {3618.0: 4}, {3637.0: 4}, {3652.0: 4}, {3661.0: 4}, {3674.0: 4},
{3721.0: 4}, {3734.0: 4}, {3737.0: 4}, {3740.0: 4}, {3754.0: 4}, {3764.0: 4}, {3776.
0: 4}, {3778.0: 4}, {3781.0: 4}, {3783.0: 4}, {3788.0: 4}, {3805.0: 4}, {3811.0: 4},
{3826.0: 4}, {3834.0: 4}, {3835.0: 4}, {3850.0: 4}, {3856.0: 4}, {3857.0: 4}, {3865.
0: 4}, {3871.0: 4}, {3926.0: 4}, {3935.0: 5}, {3952.0: 5}, {3969.0: 5}, {3976.0: 5},
{4021.0: 5}, {4026.0: 5}, {4031.0: 5}, {4044.0: 5}, {4066.0: 5}, {4089.0: 5}, {4091.
0: 5}, {4165.0: 5}, {4178.0: 5}, {4234.0: 5}, {4248.0: 5}, {4255.0: 5}, {4259.0: 5},
{4292.0: 5}, {4296.0: 5}, {4304.0: 5}, {4305.0: 5}, {4311.0: 5}, {4319.0: 5}, {4329.
0: 5}, {4335.0: 5}, {4349.0: 5}, {4356.0: 5}, {4375.0: 5}, {4388.0: 5}, {4429.0: 5},
{4435.0: 5}, {4436.0: 5}, {4438.0: 5}, {4454.0: 5}, {4475.0: 5}, {4496.0: 5}, {4513.
0: 5}, {4521.0: 5}, {4533.0: 5}, {4547.0: 5}, {4550.0: 5}, {4559.0: 5}, {4573.0: 5},
{4599.0: 5}, {4608.0: 5}, {4609.0: 5}, {4611.0: 5}, {4687.0: 5}, {4691.0: 5}, {4735.
0: 5}, {4737.0: 5}, {4740.0: 5}, {4750.0: 5}, {4752.0: 5}, {4797.0: 5}, {4805.0: 5},
{4808.0: 5}, {4822.0: 5}, {4840.0: 5}, {4863.0: 5}, {4867.0: 5}, {4869.0: 5}, {4870.
0: 5}, {4897.0: 5}, {4936.0: 5}, {4937.0: 5}, {4940.0: 5}, {4947.0: 5}, {4951.0: 5},
{4965.0: 5}, {5002.0: 5}, {5013.0: 5}, {5016.0: 5}, {5038.0: 5}, {5039.0: 5}, {5041.
0: 5}, {5067.0: 5}, {5083.0: 5}, {5085.0: 5}, {5097.0: 6}, {5133.0: 6}, {5156.0: 6},
{5179.0: 6}, {5188.0: 6}, {5209.0: 6}, {5234.0: 6}, {5265.0: 6}, {5281.0: 6}, {5282.
0: 6}, {5297.0: 6}, {5309.0: 6}, {5325.0: 6}, {5331.0: 6}, {5333.0: 6}, {5347.0: 6},
{5354.0: 6}, {5360.0: 6}, {5368.0: 6}, {5399.0: 6}, {5416.0: 6}, {5439.0: 6}, {5461.
0: 6}, {5479.0: 6}, {5484.0: 6}, {5496.0: 6}, {5521.0: 6}, {5523.0: 6}, {5651.0: 6},
{5813.0: 6}, {5982.0: 6}, {6557.0: 6}, {8359.0: 6}, {8463.0: 6}, {8576.0: 6}, {8745.
0: 6}, {8809.0: 6}, {9082.0: 6}, {9531.0: 6}, {9541.0: 6}, {9620.0: 6}, {9666.0: 6},
{9764.0: 6}, {9784.0: 6}, {9809.0: 6}, {9810.0: 6}, {9818.0: 6}, {9828.0: 6}, {9829.
0: 6}, {10121.0: 6}, {10122.0: 6}, {10128.0: 6}, {10646.0: 6}, {10650.0: 6}, {10675.
0: 6}, {10737.0: 6}, {10739.0: 6}, {10741.0: 6}, {10758.0: 6}, {10765.0: 6}, {10776.
0: 6}, {10800.0: 6}, {10912.0: 6}, {10955.0: 6}, {11741.0: 6}, {11763.0: 6}, {11794.
0: 6}, {11806.0: 6}, {11808.0: 6}, {11811.0: 6}, {11814.0: 6}, {11838.0: 6}, {11857.
0: 6}, {11948.0: 6}, {11963.0: 6}, {12978.0: 6}, {13088.0: 6}, {13108.0: 6}, {13200.
0: 6}, {13335.0: 7}, {13704.0: 7}, {13757.0: 7}, {13899.0: 7}, {13983.0: 7}, {14061.
0: 7}, {14118.0: 7}, {14485.0: 7}, {14849.0: 7}, {15814.0: 7}, {15837.0: 7}, {15893.
0: 7}, {15999.0: 7}, {16120.0: 7}, {16133.0: 7}, {16136.0: 7}, {16149.0: 7}, {16150.
0: 7}, {16262.0: 7}, {16415.0: 7}, {16475.0: 7}, {16508.0: 7}, {16615.0: 7}, {16624.
0: 7}, {16660.0: 7}, {16707.0: 7}, {16725.0: 7}, {16726.0: 7}, {16844.0: 7}, {16882.
0: 7}, {16942.0: 7}, {16960.0: 7}, {16963.0: 7}, {17023.0: 7}, {17083.0: 7}, {17094.
0: 7}, {17095.0: 7}, {17099.0: 7}, {17408.0: 7}, {17519.0: 7}, {17675.0: 7}, {17885.
0: 7}, {17891.0: 7}, {18169.0: 7}, {18232.0: 7}, {18529.0: 7}, {18543.0: 7}, {18553.
0: 7}, {18700.0: 7}, {18732.0: 7}, {18825.0: 7}, {18828.0: 7}, {18944.0: 7}, {18946.
0: 7}, {19016.0: 7}, {19305.0: 7}, {19582.0: 7}, {19610.0: 7}, {19676.0: 7}, {19804.
0: 7}, {19810.0: 7}, {19944.0: 7}, {20004.0: 7}, {20047.0: 7}, {20160.0: 7}, {20270.
0: 7}, {20565.0: 7}, {20577.0: 7}, {20686.0: 7}, {20710.0: 7}, {20963.0: 7}, {20976.
0: 7}, {21012.0: 7}]

In [ ]:

Recomendados

Cf. n.° 502018301-2023-399-0; 02 MAY 2023. Denuncia penal por PREVARICATO. 117p von
Cf. n.° 502018301-2023-399-0; 02 MAY 2023. Denuncia penal por PREVARICATO. 117pCf. n.° 502018301-2023-399-0; 02 MAY 2023. Denuncia penal por PREVARICATO. 117p
Cf. n.° 502018301-2023-399-0; 02 MAY 2023. Denuncia penal por PREVARICATO. 117pDylan Ezequiel LÓPEZ ENCARNACIÓN
180 views117 Folien
Police and Detective Training Schools von
Police and Detective Training SchoolsPolice and Detective Training Schools
Police and Detective Training SchoolsApplied Forensic Research Sciences
1.5K views16 Folien
Doctrina Policial PNP von
Doctrina Policial PNPDoctrina Policial PNP
Doctrina Policial PNPJuan Antonio Alvarez Manrique
5.4K views5 Folien
Rectificacion d epartida de nacimiento von
Rectificacion d epartida de nacimientoRectificacion d epartida de nacimiento
Rectificacion d epartida de nacimientoJaime David Medina Santiago
6K views3 Folien
node.js and the AR.Drone: building a real-time dashboard using socket.io von
node.js and the AR.Drone: building a real-time dashboard using socket.ionode.js and the AR.Drone: building a real-time dashboard using socket.io
node.js and the AR.Drone: building a real-time dashboard using socket.ioSteven Beeckman
15.9K views26 Folien
MySQL flexible schema and JSON for Internet of Things von
MySQL flexible schema and JSON for Internet of ThingsMySQL flexible schema and JSON for Internet of Things
MySQL flexible schema and JSON for Internet of ThingsAlexander Rubin
468 views35 Folien

Más contenido relacionado

Similar a Assignment7.pdf

CDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDB von
CDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDBCDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDB
CDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDBAreski Belaid
5.4K views18 Folien
The Ring programming language version 1.9 book - Part 53 of 210 von
The Ring programming language version 1.9 book - Part 53 of 210The Ring programming language version 1.9 book - Part 53 of 210
The Ring programming language version 1.9 book - Part 53 of 210Mahmoud Samir Fayed
13 views10 Folien
Is html5-ready-workshop-110727181512-phpapp02 von
Is html5-ready-workshop-110727181512-phpapp02Is html5-ready-workshop-110727181512-phpapp02
Is html5-ready-workshop-110727181512-phpapp02PL dream
343 views151 Folien
Is HTML5 Ready? (workshop) von
Is HTML5 Ready? (workshop)Is HTML5 Ready? (workshop)
Is HTML5 Ready? (workshop)Remy Sharp
1.4K views151 Folien
10 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 2019 von
10 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 201910 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 2019
10 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 2019Matt Raible
191 views65 Folien
Version1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docx von
Version1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docxVersion1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docx
Version1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docxtienboileau
2 views28 Folien

Similar a Assignment7.pdf(20)

CDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDB von Areski Belaid
CDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDBCDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDB
CDR-Stats : VoIP Analytics Solution for Asterisk and FreeSWITCH with MongoDB
Areski Belaid5.4K views
The Ring programming language version 1.9 book - Part 53 of 210 von Mahmoud Samir Fayed
The Ring programming language version 1.9 book - Part 53 of 210The Ring programming language version 1.9 book - Part 53 of 210
The Ring programming language version 1.9 book - Part 53 of 210
Is html5-ready-workshop-110727181512-phpapp02 von PL dream
Is html5-ready-workshop-110727181512-phpapp02Is html5-ready-workshop-110727181512-phpapp02
Is html5-ready-workshop-110727181512-phpapp02
PL dream343 views
Is HTML5 Ready? (workshop) von Remy Sharp
Is HTML5 Ready? (workshop)Is HTML5 Ready? (workshop)
Is HTML5 Ready? (workshop)
Remy Sharp1.4K views
10 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 2019 von Matt Raible
10 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 201910 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 2019
10 Excellent Ways to Secure Your Spring Boot Application - Devoxx Morocco 2019
Matt Raible191 views
Version1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docx von tienboileau
Version1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docxVersion1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docx
Version1.0 StartHTML000000232 EndHTML000065057 StartFragment0000.docx
tienboileau2 views
How to Hack a Road Trip with a Webcam, a GSP and Some Fun with Node von pdeschen
How to Hack a Road Trip  with a Webcam, a GSP and Some Fun with NodeHow to Hack a Road Trip  with a Webcam, a GSP and Some Fun with Node
How to Hack a Road Trip with a Webcam, a GSP and Some Fun with Node
pdeschen1.3K views
Web+GISという視点から見たGISの方向性 von Hidenori Fujimura
Web+GISという視点から見たGISの方向性Web+GISという視点から見たGISの方向性
Web+GISという視点から見たGISの方向性
Hidenori Fujimura1.8K views
Exploring Canvas von Kevin Hoyt
Exploring CanvasExploring Canvas
Exploring Canvas
Kevin Hoyt1.7K views
The Ring programming language version 1.5.2 book - Part 52 of 181 von Mahmoud Samir Fayed
The Ring programming language version 1.5.2 book - Part 52 of 181The Ring programming language version 1.5.2 book - Part 52 of 181
The Ring programming language version 1.5.2 book - Part 52 of 181
Bonnes pratiques de développement avec Node js von Francois Zaninotto
Bonnes pratiques de développement avec Node jsBonnes pratiques de développement avec Node js
Bonnes pratiques de développement avec Node js
Francois Zaninotto5.7K views
The Ring programming language version 1.4.1 book - Part 13 of 31 von Mahmoud Samir Fayed
The Ring programming language version 1.4.1 book - Part 13 of 31The Ring programming language version 1.4.1 book - Part 13 of 31
The Ring programming language version 1.4.1 book - Part 13 of 31
Detection of errors and potential vulnerabilities in C and C++ code using the... von Andrey Karpov
Detection of errors and potential vulnerabilities in C and C++ code using the...Detection of errors and potential vulnerabilities in C and C++ code using the...
Detection of errors and potential vulnerabilities in C and C++ code using the...
Andrey Karpov95 views
The Ring programming language version 1.10 book - Part 54 of 212 von Mahmoud Samir Fayed
The Ring programming language version 1.10 book - Part 54 of 212The Ring programming language version 1.10 book - Part 54 of 212
The Ring programming language version 1.10 book - Part 54 of 212
AWS IoTで家庭内IoTをやってみた【JAWS DAYS 2016】 von tsuchimon
AWS IoTで家庭内IoTをやってみた【JAWS DAYS 2016】AWS IoTで家庭内IoTをやってみた【JAWS DAYS 2016】
AWS IoTで家庭内IoTをやってみた【JAWS DAYS 2016】
tsuchimon3.3K views
Books von flaglio
BooksBooks
Books
flaglio1.2K views
groovy databases von Paul King
groovy databasesgroovy databases
groovy databases
Paul King5.3K views
How to build a html5 websites.v1 von Bitla Software
How to build a html5 websites.v1How to build a html5 websites.v1
How to build a html5 websites.v1
Bitla Software2.4K views
The Ring programming language version 1.7 book - Part 48 of 196 von Mahmoud Samir Fayed
The Ring programming language version 1.7 book - Part 48 of 196The Ring programming language version 1.7 book - Part 48 of 196
The Ring programming language version 1.7 book - Part 48 of 196

Más de dash41

Assignment 6.3.pdf von
Assignment 6.3.pdfAssignment 6.3.pdf
Assignment 6.3.pdfdash41
16 views2 Folien
Assignment 6.2b.pdf von
Assignment 6.2b.pdfAssignment 6.2b.pdf
Assignment 6.2b.pdfdash41
9 views4 Folien
Assignment 6.2a.pdf von
Assignment 6.2a.pdfAssignment 6.2a.pdf
Assignment 6.2a.pdfdash41
10 views4 Folien
Assignment 6.1.pdf von
Assignment 6.1.pdfAssignment 6.1.pdf
Assignment 6.1.pdfdash41
14 views4 Folien
Assignment 5.3.pdf von
Assignment 5.3.pdfAssignment 5.3.pdf
Assignment 5.3.pdfdash41
19 views4 Folien
Assignment 5.2.pdf von
Assignment 5.2.pdfAssignment 5.2.pdf
Assignment 5.2.pdfdash41
4 views7 Folien

Más de dash41(11)

Assignment 6.3.pdf von dash41
Assignment 6.3.pdfAssignment 6.3.pdf
Assignment 6.3.pdf
dash4116 views
Assignment 6.2b.pdf von dash41
Assignment 6.2b.pdfAssignment 6.2b.pdf
Assignment 6.2b.pdf
dash419 views
Assignment 6.2a.pdf von dash41
Assignment 6.2a.pdfAssignment 6.2a.pdf
Assignment 6.2a.pdf
dash4110 views
Assignment 6.1.pdf von dash41
Assignment 6.1.pdfAssignment 6.1.pdf
Assignment 6.1.pdf
dash4114 views
Assignment 5.3.pdf von dash41
Assignment 5.3.pdfAssignment 5.3.pdf
Assignment 5.3.pdf
dash4119 views
Assignment 5.2.pdf von dash41
Assignment 5.2.pdfAssignment 5.2.pdf
Assignment 5.2.pdf
dash414 views
Assignment 5.1.pdf von dash41
Assignment 5.1.pdfAssignment 5.1.pdf
Assignment 5.1.pdf
dash413 views
Assignment 4.pdf von dash41
Assignment 4.pdfAssignment 4.pdf
Assignment 4.pdf
dash4129 views
Assignment 3.pdf von dash41
Assignment 3.pdfAssignment 3.pdf
Assignment 3.pdf
dash4132 views
rdbms.pdf von dash41
rdbms.pdfrdbms.pdf
rdbms.pdf
dash412 views
documentsdb.pdf von dash41
documentsdb.pdfdocumentsdb.pdf
documentsdb.pdf
dash412 views

Último

Oral presentation.pdf von
Oral presentation.pdfOral presentation.pdf
Oral presentation.pdfreemalmazroui8
5 views10 Folien
DGST Methodology Presentation.pdf von
DGST Methodology Presentation.pdfDGST Methodology Presentation.pdf
DGST Methodology Presentation.pdfmaddierlegum
7 views9 Folien
Customer Data Cleansing Project.pptx von
Customer Data Cleansing Project.pptxCustomer Data Cleansing Project.pptx
Customer Data Cleansing Project.pptxNat O
6 views23 Folien
DGIQ East 2023 AI Ethics SIG von
DGIQ East 2023 AI Ethics SIGDGIQ East 2023 AI Ethics SIG
DGIQ East 2023 AI Ethics SIGKaren Lopez
5 views7 Folien
Pydata Global 2023 - How can a learnt model unlearn something von
Pydata Global 2023 - How can a learnt model unlearn somethingPydata Global 2023 - How can a learnt model unlearn something
Pydata Global 2023 - How can a learnt model unlearn somethingSARADINDU SENGUPTA
8 views13 Folien
Inawsidom - Data Journey von
Inawsidom - Data JourneyInawsidom - Data Journey
Inawsidom - Data JourneyPhilipBasford
9 views38 Folien

Último(20)

DGST Methodology Presentation.pdf von maddierlegum
DGST Methodology Presentation.pdfDGST Methodology Presentation.pdf
DGST Methodology Presentation.pdf
maddierlegum7 views
Customer Data Cleansing Project.pptx von Nat O
Customer Data Cleansing Project.pptxCustomer Data Cleansing Project.pptx
Customer Data Cleansing Project.pptx
Nat O6 views
DGIQ East 2023 AI Ethics SIG von Karen Lopez
DGIQ East 2023 AI Ethics SIGDGIQ East 2023 AI Ethics SIG
DGIQ East 2023 AI Ethics SIG
Karen Lopez5 views
Pydata Global 2023 - How can a learnt model unlearn something von SARADINDU SENGUPTA
Pydata Global 2023 - How can a learnt model unlearn somethingPydata Global 2023 - How can a learnt model unlearn something
Pydata Global 2023 - How can a learnt model unlearn something
Listed Instruments Survey 2022.pptx von secretariat4
Listed Instruments Survey  2022.pptxListed Instruments Survey  2022.pptx
Listed Instruments Survey 2022.pptx
secretariat4130 views
Underfunded.pptx von vgarcia19
Underfunded.pptxUnderfunded.pptx
Underfunded.pptx
vgarcia1915 views
Product Research sample.pdf von AllenSingson
Product Research sample.pdfProduct Research sample.pdf
Product Research sample.pdf
AllenSingson35 views
Best Home Security Systems.pptx von mogalang
Best Home Security Systems.pptxBest Home Security Systems.pptx
Best Home Security Systems.pptx
mogalang9 views
PRIVACY AWRE PERSONAL DATA STORAGE von antony420421
PRIVACY AWRE PERSONAL DATA STORAGEPRIVACY AWRE PERSONAL DATA STORAGE
PRIVACY AWRE PERSONAL DATA STORAGE
antony4204218 views
Running PostgreSQL in a Kubernetes cluster: CloudNativePG von Nick Ivanov
Running PostgreSQL in a Kubernetes cluster: CloudNativePGRunning PostgreSQL in a Kubernetes cluster: CloudNativePG
Running PostgreSQL in a Kubernetes cluster: CloudNativePG
Nick Ivanov7 views
Data Journeys Hard Talk workshop final.pptx von info828217
Data Journeys Hard Talk workshop final.pptxData Journeys Hard Talk workshop final.pptx
Data Journeys Hard Talk workshop final.pptx
info82821711 views

Assignment7.pdf

  • 1. Assignment 7.1.a pyarrow.Table airline: struct<airline_id: int64, name: string, alias: string, iata: string, icao: string, callsign: string, country: string, active: bool> child 0, airline_id: int64 child 1, name: string child 2, alias: string child 3, iata: string In [93]: import os import json from pathlib import Path import gzip import hashlib import shutil import pandas as pd import pygeohash import s3fs endpoint_url='https://storage.budsc.midwest-datascience.com' current_dir = Path(os.getcwd()).absolute() results_dir = current_dir.joinpath('results') if results_dir.exists(): shutil.rmtree(results_dir) results_dir.mkdir(parents=True, exist_ok=True) def read_jsonl_data(): s3 = s3fs.S3FileSystem( anon=True, client_kwargs={ 'endpoint_url': endpoint_url } ) src_data_path = 'data/processed/openflights/routes.jsonl.gz' with s3.open(src_data_path, 'rb') as f_gz: with gzip.open(f_gz, 'rb') as f: records = [json.loads(line) for line in f.readlines()] return records In [94]: from pyarrow.json import read_json import pyarrow.parquet as pq def create_parquet_dataset(): src_data_path = 'data/processed/openflights/routes.jsonl.gz' parquet_output_path = results_dir.joinpath('routes.parquet') s3 = s3fs.S3FileSystem( anon=True, client_kwargs={ 'endpoint_url': endpoint_url } ) with s3.open(src_data_path, 'rb') as f_gz: with gzip.open(f_gz, 'rb') as f: ## TODO: Use Apache Arrow to create Parquet table and save the dataset table = read_json(f) print(table) pq.write_table(table, parquet_output_path, compression='none') create_parquet_dataset()
  • 2. child 4, icao: string child 5, callsign: string child 6, country: string child 7, active: bool src_airport: struct<airport_id: int64, name: string, city: string, country: string, iata: string, icao: string, latitude: double, longitude: double, altitude: int64, ti mezone: double, dst: string, tz_id: string, type: string, source: string> child 0, airport_id: int64 child 1, name: string child 2, city: string child 3, country: string child 4, iata: string child 5, icao: string child 6, latitude: double child 7, longitude: double child 8, altitude: int64 child 9, timezone: double child 10, dst: string child 11, tz_id: string child 12, type: string child 13, source: string dst_airport: struct<airport_id: int64, name: string, city: string, country: string, iata: string, icao: string, latitude: double, longitude: double, altitude: int64, ti mezone: double, dst: string, tz_id: string, type: string, source: string> child 0, airport_id: int64 child 1, name: string child 2, city: string child 3, country: string child 4, iata: string child 5, icao: string child 6, latitude: double child 7, longitude: double child 8, altitude: int64 child 9, timezone: double child 10, dst: string child 11, tz_id: string child 12, type: string child 13, source: string codeshare: bool equipment: list<item: string> child 0, item: string ['codeshare', 'equipment', 'airline.airline_id', 'airline.name', 'airline.alias', 'a irline.iata', 'airline.icao', 'airline.callsign', 'airline.country', 'airline.activ e', 'src_airport.airport_id', 'src_airport.name', 'src_airport.city', 'src_airport.c ountry', 'src_airport.iata', 'src_airport.icao', 'src_airport.latitude', 'src_airpor t.longitude', 'src_airport.altitude', 'src_airport.timezone', 'src_airport.dst', 'sr c_airport.tz_id', 'src_airport.type', 'src_airport.source', 'dst_airport.airport_i d', 'dst_airport.name', 'dst_airport.city', 'dst_airport.country', 'dst_airport.iat a', 'dst_airport.icao', 'dst_airport.latitude', 'dst_airport.longitude', 
'dst_airpor t.altitude', 'dst_airport.timezone', 'dst_airport.dst', 'dst_airport.tz_id', 'dst_ai rport.type', 'dst_airport.source'] In [95]: parquet_output_path = results_dir.joinpath('routes.parquet') pq = pd.read_parquet(parquet_output_path, engine='fastparquet') print(list(pq.columns.values)) In [96]: partitions = ( ('A', 'A'), ('B', 'B'), ('C', 'D'), ('E', 'F'), ('G', 'H'), ('I', 'J'), ('K', 'L'), ('M', 'M'), ('N', 'N'), ('O', 'P'), ('Q', 'R'), ('S', 'T'), ('U', 'U'), ('V', 'V'), ('W', 'X'), ('Y', 'Z') ) In [97]: partitions_keys = ( 'A', 'B', 'C-D', 'E-F', 'G-H', 'I-J', 'K-L', 'M', 'N', 'O-P', 'Q-R', 'S-T',
  • 3. {'A': ('A', 'A'), 'B': ('B', 'B'), 'C-D': ('C', 'D'), 'E-F': ('E', 'F'), 'G-H': ('G', 'H'), 'I-J': ('I', 'J'), 'K-L': ('K', 'L'), 'M': ('M', 'M'), 'N': ('N', 'N'), 'O-P': ('O', 'P'), 'Q-R': ('Q', 'R'), 'S-T': ('S', 'T'), 'U': ('U', 'U'), 'V': ('V', 'V'), 'W-X': ('W', 'X'), 'Y-Z': ('Y', 'Z')} Assignment 7.1.b codeshare equipment airline.airline_id airline.name airline.alias airline.iata airline.icao airline. 0 0.0 [CR2] 410.0 Aerocondor ANA All Nippon Airways 2B ARD AEROC 1 0.0 [CR2] 410.0 Aerocondor ANA All Nippon Airways 2B ARD AEROC 2 0.0 [CR2] 410.0 Aerocondor ANA All Nippon Airways 2B ARD AEROC 3 0.0 [CR2] 410.0 Aerocondor ANA All Nippon Airways 2B ARD AEROC 4 0.0 [CR2] 410.0 Aerocondor ANA All Nippon Airways 2B ARD AEROC 5 rows × 41 columns 'U', 'V', 'W-X', 'Y-Z' ) In [98]: parts_k_v = dict(zip(partitions_keys, partitions)) print(parts_k_v) In [99]: def get_key(val): for key, value in parts_k_v.items(): if val in value: return key return "0" In [100… pq['key'] = pq['src_airport.iata'] + pq['dst_airport.iata'] + pq['airline.iata'] pq['partition_value'] = pq['key'].str[:1] pq['kv_key'] = pq.apply(lambda x: get_key(x.partition_value), axis=1) In [101… # remove invalid keys pq = pq[pq.kv_key != "0"].astype('float32', errors='ignore') In [102… pq.head() Out[102… In [104… import pyarrow as pa import pyarrow.parquet as parpq pq_tab = pa.Table.from_pandas(pq) parpq.write_to_dataset( pq_tab, root_path=results_dir.joinpath('kv'), partition_cols=['kv_key'], )
  • 4. Assignment 7.1.c {'west': 'c21g6s0rs4c7', 'central': '9z7dnebnj8kb', 'east': 'dqby34cjw922'} Assignment 7.1.d In [105… import hashlib def hash_key(key): m = hashlib.sha256() m.update(str(key).encode('utf-8')) return m.hexdigest() In [106… pq['key'] = pq['src_airport.iata']+pq['dst_airport.iata']+pq['airline.iata'] pq['hashed'] = pq.apply(lambda x: hash_key(x.key), axis=1) pq['hash_key'] = pq['hashed'].str[:1] In [107… pq_tab1 = pa.Table.from_pandas(pq) parpq.write_to_dataset( pq_tab1, root_path=results_dir.joinpath('hash'), partition_cols=['hash_key'], ) In [109… #get hash for datacenters datacenters = {} datacenters['west'] = pygeohash.encode(45.5945645, -121.1786823) datacenters['central'] = pygeohash.encode(41.1544433, -96.0422378) datacenters['east'] = pygeohash.encode(39.08344, -77.6497145) print(datacenters) In [110… def closest_datacenter(latitude, longitude): geohash = pygeohash.encode(latitude, longitude) dist_dict = {} closest_datacenter = '' last_distance = None for key, value in datacenters.items(): dist = pygeohash.geohash_approximate_distance(str(geohash), str(value)) dist_dict[key] = dist if (last_distance == None) or (dist < last_distance): closest_datacenter = key last_distance = dist return closest_datacenter In [113… pq['datacenter'] = pq[['src_airport.latitude', 'src_airport.longitude']].apply(lambd In [114… pq_tab2 = pa.Table.from_pandas(pq) parpq.write_to_dataset( pq_tab2, root_path=results_dir.joinpath('geo'), partition_cols=['datacenter'], )
  • 5. 0 410.0 1 410.0 2 410.0 3 410.0 4 410.0 Name: airline.airline_id, dtype: float32 [{-1.0: 1}, {10.0: 1}, {21.0: 1}, {24.0: 1}, {28.0: 1}, {29.0: 1}, {32.0: 1}, {35.0: 1}, {42.0: 1}, {43.0: 1}, {55.0: 1}, {68.0: 1}, {83.0: 1}, {90.0: 1}, {96.0: 1}, {10 6.0: 1}, {109.0: 1}, {116.0: 1}, {125.0: 1}, {130.0: 1}, {132.0: 1}, {137.0: 1}, {13 9.0: 1}, {146.0: 1}, {153.0: 1}, {179.0: 1}, {197.0: 1}, {214.0: 1}, {218.0: 1}, {22 0.0: 1}, {221.0: 1}, {225.0: 1}, {231.0: 1}, {240.0: 1}, {241.0: 1}, {242.0: 1}, {24 6.0: 1}, {312.0: 1}, {316.0: 1}, {319.0: 1}, {321.0: 1}, {324.0: 1}, {328.0: 1}, {32 9.0: 1}, {330.0: 1}, {333.0: 1}, {336.0: 1}, {338.0: 1}, {341.0: 1}, {345.0: 1}, {38 6.0: 1}, {397.0: 1}, {410.0: 1}, {412.0: 1}, {426.0: 1}, {439.0: 1}, {442.0: 1}, {46 2.0: 1}, {470.0: 1}, {476.0: 1}, {477.0: 1}, {491.0: 1}, {502.0: 1}, {503.0: 1}, {50 8.0: 1}, {515.0: 1}, {524.0: 1}, {543.0: 1}, {563.0: 1}, {567.0: 1}, {569.0: 1}, {57 6.0: 1}, {595.0: 1}, {596.0: 1}, {603.0: 1}, {608.0: 1}, {622.0: 1}, {641.0: 1}, {68 3.0: 1}, {690.0: 2}, {692.0: 2}, {751.0: 2}, {753.0: 2}, {794.0: 2}, {807.0: 2}, {83 7.0: 2}, {879.0: 2}, {881.0: 2}, {882.0: 2}, {896.0: 2}, {897.0: 2}, {921.0: 2}, {97 0.0: 2}, {995.0: 2}, {998.0: 2}, {1006.0: 2}, {1008.0: 2}, {1034.0: 2}, {1048.0: 2}, {1057.0: 2}, {1066.0: 2}, {1073.0: 2}, {1109.0: 2}, {1173.0: 2}, {1191.0: 2}, {1203. 0: 2}, {1206.0: 2}, {1230.0: 2}, {1266.0: 2}, {1287.0: 2}, {1290.0: 2}, {1299.0: 2}, {1308.0: 2}, {1316.0: 2}, {1317.0: 2}, {1338.0: 2}, {1340.0: 2}, {1355.0: 2}, {1359. 0: 2}, {1392.0: 2}, {1401.0: 2}, {1403.0: 2}, {1422.0: 2}, {1434.0: 2}, {1441.0: 2}, {1463.0: 2}, {1469.0: 2}, {1472.0: 2}, {1478.0: 2}, {1492.0: 2}, {1500.0: 2}, {1508. 0: 2}, {1531.0: 2}, {1539.0: 2}, {1548.0: 2}, {1581.0: 2}, {1611.0: 2}, {1623.0: 2}, {1629.0: 2}, {1654.0: 2}, {1663.0: 2}, {1669.0: 2}, {1680.0: 2}, {1682.0: 2}, {1683. 
# [captured output, continued from the previous line of the PDF export; line-wrap
# artifacts such as "{1886. 0: 2}" are kept exactly as exported]
# 0: 2}, {1729.0: 2}, {1750.0: 2}, {1756.0: 2}, {1758.0: 2}, {1767.0: 2}, {1769.0: 2},
# {1775.0: 2}, {1790.0: 2}, {1792.0: 2}, {1829.0: 2}, {1844.0: 2}, {1868.0: 2}, {1886.
# 0: 2}, {1889.0: 3}, {1908.0: 3}, {1909.0: 3}, {1925.0: 3}, {1936.0: 3}, {1942.0: 3},
# {1943.0: 3}, {1946.0: 3}, {1954.0: 3}, {1966.0: 3}, {2009.0: 3}, {2056.0: 3}, {2058.
# 0: 3}, {2091.0: 3}, {2094.0: 3}, {2104.0: 3}, {2117.0: 3}, {2143.0: 3}, {2150.0: 3},
# {2183.0: 3}, {2193.0: 3}, {2217.0: 3}, {2218.0: 3}, {2220.0: 3}, {2222.0: 3}, {2226.
# 0: 3}, {2245.0: 3}, {2260.0: 3}, {2264.0: 3}, {2293.0: 3}, {2297.0: 3}, {2324.0: 3},
# {2350.0: 3}, {2353.0: 3}, {2354.0: 3}, {2395.0: 3}, {2409.0: 3}, {2417.0: 3}, {2418.
# 0: 3}, {2419.0: 3}, {2420.0: 3}, {2421.0: 3}, {2439.0: 3}, {2468.0: 3}, {2520.0: 3},
# {2524.0: 3}, {2538.0: 3}, {2541.0: 3}, {2547.0: 3}, {2548.0: 3}, {2575.0: 3}, {2585.
# 0: 3}, {2607.0: 3}, {2622.0: 3}, {2638.0: 3}, {2660.0: 3}, {2681.0: 3}, {2682.0: 3},
# {2684.0: 3}, {2688.0: 3}, {2692.0: 3}, {2731.0: 3}, {2748.0: 3}, {2750.0: 3}, {2757.
# 0: 3}, {2765.0: 3}, {2773.0: 3}, {2774.0: 3}, {2822.0: 3}, {2825.0: 3}, {2826.0: 3},
# {2835.0: 3}, {2850.0: 3}, {2857.0: 3}, {2881.0: 3}, {2896.0: 3}, {2916.0: 3}, {2922.
# 0: 3}, {2923.0: 3}, {2942.0: 4}, {2951.0: 4}, {2954.0: 4}, {2987.0: 4}, {2989.0: 4},

# In [128]
pq['airline.airline_id'].head()
# Out[128]: shown in the captured output preceding this cell.

# In [133]
import numpy as np


def balance_partitions(keys, num_partitions):
    """Split ``keys`` into contiguous chunks whose sums are roughly equal.

    The running total of ``keys`` is computed, the grand total is divided into
    ``num_partitions`` equal targets, and the sequence is cut at the first
    position where the running total reaches each target.

    NOTE(review): the captured notebook output around this cell is a list of
    ``{key: partition_number}`` dicts, which is not the shape returned here;
    it appears to come from a different revision of the function.

    Args:
        keys: Sequence of numeric keys (list, Series or array).
        num_partitions: Number of chunks to produce; must be at least 1.

    Returns:
        A list of ``num_partitions`` NumPy arrays that concatenate back to
        ``keys`` in the original order. Some arrays may be empty.

    Raises:
        ValueError: If ``num_partitions`` is smaller than 1.
    """
    if num_partitions < 1:
        raise ValueError("num_partitions must be at least 1")

    # The caller passes a plain list, which has no .cumsum(); normalise first.
    values = np.asarray(keys)
    # Accumulate in float64 so float32 keys keep precision in a long running sum.
    running = np.cumsum(values, dtype=np.float64)
    # sum of the entire array (0 for empty input, where running[-1] would fail)
    total = running[-1] if running.size else 0.0
    partsum = total // num_partitions
    # cumulative-sum target at the end of every partition except the last
    cum_part_sums = np.arange(1, num_partitions) * partsum
    # searchsorted expects `running` to be non-decreasing, i.e. non-negative keys.
    # NOTE(review): the captured output lists a key of -1.0 - confirm whether
    # negative ids need to be filtered out before calling this.
    inds = np.searchsorted(running, cum_part_sums)
    # split into approximately equal-sum arrays
    return np.split(values, inds)


# In [134]
keys = list(pq['airline.airline_id'])
num_partitions = 7

# In [135]
print(balance_partitions(keys, num_partitions))
  • 6. {2990.0: 4}, {2993.0: 4}, {2994.0: 4}, {3000.0: 4}, {3021.0: 4}, {3026.0: 4}, {3029. 0: 4}, {3052.0: 4}, {3081.0: 4}, {3090.0: 4}, {3097.0: 4}, {3123.0: 4}, {3126.0: 4}, {3148.0: 4}, {3163.0: 4}, {3179.0: 4}, {3197.0: 4}, {3200.0: 4}, {3201.0: 4}, {3210. 0: 4}, {3233.0: 4}, {3251.0: 4}, {3258.0: 4}, {3287.0: 4}, {3290.0: 4}, {3320.0: 4}, {3329.0: 4}, {3342.0: 4}, {3354.0: 4}, {3370.0: 4}, {3378.0: 4}, {3386.0: 4}, {3391. 0: 4}, {3392.0: 4}, {3393.0: 4}, {3432.0: 4}, {3437.0: 4}, {3463.0: 4}, {3490.0: 4}, {3498.0: 4}, {3534.0: 4}, {3539.0: 4}, {3545.0: 4}, {3547.0: 4}, {3574.0: 4}, {3589. 0: 4}, {3613.0: 4}, {3618.0: 4}, {3637.0: 4}, {3652.0: 4}, {3661.0: 4}, {3674.0: 4}, {3721.0: 4}, {3734.0: 4}, {3737.0: 4}, {3740.0: 4}, {3754.0: 4}, {3764.0: 4}, {3776. 0: 4}, {3778.0: 4}, {3781.0: 4}, {3783.0: 4}, {3788.0: 4}, {3805.0: 4}, {3811.0: 4}, {3826.0: 4}, {3834.0: 4}, {3835.0: 4}, {3850.0: 4}, {3856.0: 4}, {3857.0: 4}, {3865. 0: 4}, {3871.0: 4}, {3926.0: 4}, {3935.0: 5}, {3952.0: 5}, {3969.0: 5}, {3976.0: 5}, {4021.0: 5}, {4026.0: 5}, {4031.0: 5}, {4044.0: 5}, {4066.0: 5}, {4089.0: 5}, {4091. 0: 5}, {4165.0: 5}, {4178.0: 5}, {4234.0: 5}, {4248.0: 5}, {4255.0: 5}, {4259.0: 5}, {4292.0: 5}, {4296.0: 5}, {4304.0: 5}, {4305.0: 5}, {4311.0: 5}, {4319.0: 5}, {4329. 0: 5}, {4335.0: 5}, {4349.0: 5}, {4356.0: 5}, {4375.0: 5}, {4388.0: 5}, {4429.0: 5}, {4435.0: 5}, {4436.0: 5}, {4438.0: 5}, {4454.0: 5}, {4475.0: 5}, {4496.0: 5}, {4513. 0: 5}, {4521.0: 5}, {4533.0: 5}, {4547.0: 5}, {4550.0: 5}, {4559.0: 5}, {4573.0: 5}, {4599.0: 5}, {4608.0: 5}, {4609.0: 5}, {4611.0: 5}, {4687.0: 5}, {4691.0: 5}, {4735. 0: 5}, {4737.0: 5}, {4740.0: 5}, {4750.0: 5}, {4752.0: 5}, {4797.0: 5}, {4805.0: 5}, {4808.0: 5}, {4822.0: 5}, {4840.0: 5}, {4863.0: 5}, {4867.0: 5}, {4869.0: 5}, {4870. 0: 5}, {4897.0: 5}, {4936.0: 5}, {4937.0: 5}, {4940.0: 5}, {4947.0: 5}, {4951.0: 5}, {4965.0: 5}, {5002.0: 5}, {5013.0: 5}, {5016.0: 5}, {5038.0: 5}, {5039.0: 5}, {5041. 
0: 5}, {5067.0: 5}, {5083.0: 5}, {5085.0: 5}, {5097.0: 6}, {5133.0: 6}, {5156.0: 6}, {5179.0: 6}, {5188.0: 6}, {5209.0: 6}, {5234.0: 6}, {5265.0: 6}, {5281.0: 6}, {5282. 0: 6}, {5297.0: 6}, {5309.0: 6}, {5325.0: 6}, {5331.0: 6}, {5333.0: 6}, {5347.0: 6}, {5354.0: 6}, {5360.0: 6}, {5368.0: 6}, {5399.0: 6}, {5416.0: 6}, {5439.0: 6}, {5461. 0: 6}, {5479.0: 6}, {5484.0: 6}, {5496.0: 6}, {5521.0: 6}, {5523.0: 6}, {5651.0: 6}, {5813.0: 6}, {5982.0: 6}, {6557.0: 6}, {8359.0: 6}, {8463.0: 6}, {8576.0: 6}, {8745. 0: 6}, {8809.0: 6}, {9082.0: 6}, {9531.0: 6}, {9541.0: 6}, {9620.0: 6}, {9666.0: 6}, {9764.0: 6}, {9784.0: 6}, {9809.0: 6}, {9810.0: 6}, {9818.0: 6}, {9828.0: 6}, {9829. 0: 6}, {10121.0: 6}, {10122.0: 6}, {10128.0: 6}, {10646.0: 6}, {10650.0: 6}, {10675. 0: 6}, {10737.0: 6}, {10739.0: 6}, {10741.0: 6}, {10758.0: 6}, {10765.0: 6}, {10776. 0: 6}, {10800.0: 6}, {10912.0: 6}, {10955.0: 6}, {11741.0: 6}, {11763.0: 6}, {11794. 0: 6}, {11806.0: 6}, {11808.0: 6}, {11811.0: 6}, {11814.0: 6}, {11838.0: 6}, {11857. 0: 6}, {11948.0: 6}, {11963.0: 6}, {12978.0: 6}, {13088.0: 6}, {13108.0: 6}, {13200. 0: 6}, {13335.0: 7}, {13704.0: 7}, {13757.0: 7}, {13899.0: 7}, {13983.0: 7}, {14061. 0: 7}, {14118.0: 7}, {14485.0: 7}, {14849.0: 7}, {15814.0: 7}, {15837.0: 7}, {15893. 0: 7}, {15999.0: 7}, {16120.0: 7}, {16133.0: 7}, {16136.0: 7}, {16149.0: 7}, {16150. 0: 7}, {16262.0: 7}, {16415.0: 7}, {16475.0: 7}, {16508.0: 7}, {16615.0: 7}, {16624. 0: 7}, {16660.0: 7}, {16707.0: 7}, {16725.0: 7}, {16726.0: 7}, {16844.0: 7}, {16882. 0: 7}, {16942.0: 7}, {16960.0: 7}, {16963.0: 7}, {17023.0: 7}, {17083.0: 7}, {17094. 0: 7}, {17095.0: 7}, {17099.0: 7}, {17408.0: 7}, {17519.0: 7}, {17675.0: 7}, {17885. 0: 7}, {17891.0: 7}, {18169.0: 7}, {18232.0: 7}, {18529.0: 7}, {18543.0: 7}, {18553. 0: 7}, {18700.0: 7}, {18732.0: 7}, {18825.0: 7}, {18828.0: 7}, {18944.0: 7}, {18946. 0: 7}, {19016.0: 7}, {19305.0: 7}, {19582.0: 7}, {19610.0: 7}, {19676.0: 7}, {19804. 
0: 7}, {19810.0: 7}, {19944.0: 7}, {20004.0: 7}, {20047.0: 7}, {20160.0: 7}, {20270. 0: 7}, {20565.0: 7}, {20577.0: 7}, {20686.0: 7}, {20710.0: 7}, {20963.0: 7}, {20976. 0: 7}, {21012.0: 7}] In [ ]: