Más contenido relacionado
Similar a Assignment7.pdf
Similar a Assignment7.pdf(20)
Assignment7.pdf
- 1. Assignment 7.1.a
pyarrow.Table
airline: struct<airline_id: int64, name: string, alias: string, iata: string, icao:
string, callsign: string, country: string, active: bool>
child 0, airline_id: int64
child 1, name: string
child 2, alias: string
child 3, iata: string
In [93]: import os
import json
from pathlib import Path
import gzip
import hashlib
import shutil
import pandas as pd
import pygeohash
import s3fs
# Notebook-wide settings: the endpoint handed to s3fs and the local output folder.
endpoint_url = 'https://storage.budsc.midwest-datascience.com'
current_dir = Path(os.getcwd()).absolute()
results_dir = current_dir / 'results'

# Wipe any previous output so every run starts from an empty results directory.
if results_dir.exists():
    shutil.rmtree(results_dir)
results_dir.mkdir(parents=True, exist_ok=True)
def read_jsonl_data():
    """Fetch the gzipped routes JSONL file and decode it line by line.

    Opens ``data/processed/openflights/routes.jsonl.gz`` anonymously through
    the endpoint stored in the notebook-level ``endpoint_url`` and parses one
    JSON document per line.

    Returns:
        list: the decoded JSON value of every line, in file order.
    """
    src_data_path = 'data/processed/openflights/routes.jsonl.gz'
    filesystem = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={'endpoint_url': endpoint_url},
    )
    with filesystem.open(src_data_path, 'rb') as compressed, \
            gzip.open(compressed, 'rb') as lines:
        # Iterating the gzip stream yields the same lines as readlines().
        records = [json.loads(raw_line) for raw_line in lines]
    return records
In [94]: from pyarrow.json import read_json
import pyarrow.parquet as pq
def create_parquet_dataset():
    """Convert the gzipped routes JSONL file into a local Parquet file.

    Reads ``data/processed/openflights/routes.jsonl.gz`` anonymously through
    the endpoint in ``endpoint_url``, parses it with ``pyarrow.json.read_json``,
    prints the resulting table and writes it to ``routes.parquet`` inside
    ``results_dir`` with compression disabled.

    Returns:
        None
    """
    # Imported locally under a private name: a later cell rebinds the
    # notebook-level name ``pq`` to a pandas DataFrame, which would break
    # ``pq.write_table`` if this function were called again afterwards.
    import pyarrow.parquet as parquet

    src_data_path = 'data/processed/openflights/routes.jsonl.gz'
    parquet_output_path = results_dir.joinpath('routes.parquet')
    s3 = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={
            'endpoint_url': endpoint_url
        }
    )
    with s3.open(src_data_path, 'rb') as f_gz:
        with gzip.open(f_gz, 'rb') as f:
            # read_json consumes the decompressed stream and returns an Arrow table.
            table = read_json(f)
    print(table)
    parquet.write_table(table, parquet_output_path, compression='none')
# Run the conversion: prints the Arrow table and writes routes.parquet under results_dir.
create_parquet_dataset()
- 2. child 4, icao: string
child 5, callsign: string
child 6, country: string
child 7, active: bool
src_airport: struct<airport_id: int64, name: string, city: string, country: string,
iata: string, icao: string, latitude: double, longitude: double, altitude: int64, ti
mezone: double, dst: string, tz_id: string, type: string, source: string>
child 0, airport_id: int64
child 1, name: string
child 2, city: string
child 3, country: string
child 4, iata: string
child 5, icao: string
child 6, latitude: double
child 7, longitude: double
child 8, altitude: int64
child 9, timezone: double
child 10, dst: string
child 11, tz_id: string
child 12, type: string
child 13, source: string
dst_airport: struct<airport_id: int64, name: string, city: string, country: string,
iata: string, icao: string, latitude: double, longitude: double, altitude: int64, ti
mezone: double, dst: string, tz_id: string, type: string, source: string>
child 0, airport_id: int64
child 1, name: string
child 2, city: string
child 3, country: string
child 4, iata: string
child 5, icao: string
child 6, latitude: double
child 7, longitude: double
child 8, altitude: int64
child 9, timezone: double
child 10, dst: string
child 11, tz_id: string
child 12, type: string
child 13, source: string
codeshare: bool
equipment: list<item: string>
child 0, item: string
['codeshare', 'equipment', 'airline.airline_id', 'airline.name', 'airline.alias', 'a
irline.iata', 'airline.icao', 'airline.callsign', 'airline.country', 'airline.activ
e', 'src_airport.airport_id', 'src_airport.name', 'src_airport.city', 'src_airport.c
ountry', 'src_airport.iata', 'src_airport.icao', 'src_airport.latitude', 'src_airpor
t.longitude', 'src_airport.altitude', 'src_airport.timezone', 'src_airport.dst', 'sr
c_airport.tz_id', 'src_airport.type', 'src_airport.source', 'dst_airport.airport_i
d', 'dst_airport.name', 'dst_airport.city', 'dst_airport.country', 'dst_airport.iat
a', 'dst_airport.icao', 'dst_airport.latitude', 'dst_airport.longitude', 'dst_airpor
t.altitude', 'dst_airport.timezone', 'dst_airport.dst', 'dst_airport.tz_id', 'dst_ai
rport.type', 'dst_airport.source']
# In [95]:
# Load the Parquet file written earlier into pandas and list its column names.
# NOTE(review): this rebinds ``pq`` -- imported earlier as the pyarrow.parquet
# module -- to a DataFrame for the rest of the notebook.
parquet_output_path = results_dir / 'routes.parquet'
pq = pd.read_parquet(parquet_output_path, engine='fastparquet')
print(pq.columns.tolist())
# In [96]:
# Letter pairs for the 16 key partitions; get_key assigns a row to the pair
# that contains the first character of its route key.
partitions = (
    ('A', 'A'), ('B', 'B'), ('C', 'D'), ('E', 'F'),
    ('G', 'H'), ('I', 'J'), ('K', 'L'), ('M', 'M'),
    ('N', 'N'), ('O', 'P'), ('Q', 'R'), ('S', 'T'),
    ('U', 'U'), ('V', 'V'), ('W', 'X'), ('Y', 'Z'),
)
In [97]: partitions_keys = (
'A', 'B', 'C-D', 'E-F',
'G-H', 'I-J', 'K-L', 'M',
'N', 'O-P', 'Q-R', 'S-T',
- 3. {'A': ('A', 'A'), 'B': ('B', 'B'), 'C-D': ('C', 'D'), 'E-F': ('E', 'F'), 'G-H':
('G', 'H'), 'I-J': ('I', 'J'), 'K-L': ('K', 'L'), 'M': ('M', 'M'), 'N': ('N', 'N'),
'O-P': ('O', 'P'), 'Q-R': ('Q', 'R'), 'S-T': ('S', 'T'), 'U': ('U', 'U'), 'V': ('V',
'V'), 'W-X': ('W', 'X'), 'Y-Z': ('Y', 'Z')}
Assignment 7.1.b
codeshare equipment airline.airline_id airline.name airline.alias airline.iata airline.icao airline.
0 0.0 [CR2] 410.0 Aerocondor
ANA All
Nippon
Airways
2B ARD AEROC
1 0.0 [CR2] 410.0 Aerocondor
ANA All
Nippon
Airways
2B ARD AEROC
2 0.0 [CR2] 410.0 Aerocondor
ANA All
Nippon
Airways
2B ARD AEROC
3 0.0 [CR2] 410.0 Aerocondor
ANA All
Nippon
Airways
2B ARD AEROC
4 0.0 [CR2] 410.0 Aerocondor
ANA All
Nippon
Airways
2B ARD AEROC
5 rows × 41 columns
'U', 'V', 'W-X', 'Y-Z'
)
# In [98]:
# Pair every partition label with its letter pair, position by position.
parts_k_v = {label: letters for label, letters in zip(partitions_keys, partitions)}
print(parts_k_v)
# In [99]:
def get_key(val):
    """Return the partition label whose letter pair contains ``val``.

    Searches the notebook-level ``parts_k_v`` mapping in its iteration order
    and stops at the first pair that contains ``val``.

    Args:
        val: value to look up (the callers pass the first character of a key).

    Returns:
        The matching label, or the string "0" when no pair contains ``val``.
    """
    matching_labels = (
        label for label, letters in parts_k_v.items() if val in letters
    )
    return next(matching_labels, "0")
# In [100…
# Route key = source IATA + destination IATA + airline IATA; the partition
# label is derived from the key's first character.
pq['key'] = pq['src_airport.iata'] + pq['dst_airport.iata'] + pq['airline.iata']
pq['partition_value'] = pq['key'].str[:1]
# Series.map calls get_key once per value; the former DataFrame.apply(axis=1)
# materialised a full row object for every record just to read one column.
pq['kv_key'] = pq['partition_value'].map(get_key)
# In [101…
# Remove rows whose key matched no partition (get_key returned "0").
# NOTE(review): astype('float32', errors='ignore') is attempted on every
# column; the recorded output shows codeshare and airline.airline_id as
# floats after this step -- confirm that conversion is intended.
has_partition = pq['kv_key'] != "0"
pq = pq.loc[has_partition].astype('float32', errors='ignore')
In [102… pq.head()
Out[102…
In [104… import pyarrow as pa
import pyarrow.parquet as parpq
# Convert the filtered DataFrame to an Arrow table and write it as a Parquet
# dataset under results_dir/kv, partitioned on the kv_key column.
pq_tab = pa.Table.from_pandas(pq)
parpq.write_to_dataset(
pq_tab,
root_path=results_dir.joinpath('kv'),
partition_cols=['kv_key'],
)
- 4. Assignment 7.1.c
{'west': 'c21g6s0rs4c7', 'central': '9z7dnebnj8kb', 'east': 'dqby34cjw922'}
Assignment 7.1.d
In [105… import hashlib
def hash_key(key):
    """Return the SHA-256 hex digest of ``str(key)`` encoded as UTF-8.

    Args:
        key: any object; it is converted with ``str`` before hashing.

    Returns:
        str: the hexadecimal digest.
    """
    encoded = str(key).encode('utf-8')
    return hashlib.sha256(encoded).hexdigest()
# In [106…
# Rebuild the route key, hash it, and keep the first hex character of the
# digest as the partition value.
pq['key'] = pq['src_airport.iata'] + pq['dst_airport.iata'] + pq['airline.iata']
# Series.map hashes the key column directly; the former DataFrame.apply(axis=1)
# built a full row object for every record just to read one column.
pq['hashed'] = pq['key'].map(hash_key)
pq['hash_key'] = pq['hashed'].str[:1]
# In [107…
# Write the frame as a Parquet dataset under results_dir/hash, partitioned on
# the hash_key column.
pq_tab1 = pa.Table.from_pandas(pq)
hash_root = results_dir / 'hash'
parpq.write_to_dataset(pq_tab1, root_path=hash_root, partition_cols=['hash_key'])
# In [109…
# Geohash of each datacenter, keyed by region name; arguments are
# (latitude, longitude) as passed to pygeohash.encode.
datacenters = {
    'west': pygeohash.encode(45.5945645, -121.1786823),
    'central': pygeohash.encode(41.1544433, -96.0422378),
    'east': pygeohash.encode(39.08344, -77.6497145),
}
print(datacenters)
# In [110…
def closest_datacenter(latitude, longitude):
    """Return the name of the datacenter nearest to a coordinate.

    The coordinate is geohashed and compared with every entry of the
    notebook-level ``datacenters`` mapping (name -> geohash) using
    ``pygeohash.geohash_approximate_distance``.

    Args:
        latitude: latitude handed to ``pygeohash.encode``.
        longitude: longitude handed to ``pygeohash.encode``.

    Returns:
        str: the name with the smallest distance; on a tie, the name that
        comes first in ``datacenters``. Empty string when ``datacenters``
        is empty.
    """
    geohash = str(pygeohash.encode(latitude, longitude))

    def distance_to(name):
        # Distance between the point's geohash and the named datacenter's geohash.
        return pygeohash.geohash_approximate_distance(geohash, str(datacenters[name]))

    # min() keeps the first minimal entry, matching the strict "<" comparison
    # of the hand-written search it replaces.
    return min(datacenters, key=distance_to, default='')
In [113… pq['datacenter'] = pq[['src_airport.latitude', 'src_airport.longitude']].apply(lambd
# In [114…
# Write the frame as a Parquet dataset under results_dir/geo, partitioned on
# the datacenter column.
pq_tab2 = pa.Table.from_pandas(pq)
geo_root = results_dir / 'geo'
parpq.write_to_dataset(pq_tab2, root_path=geo_root, partition_cols=['datacenter'])
- 5. 0 410.0
1 410.0
2 410.0
3 410.0
4 410.0
Name: airline.airline_id, dtype: float32
[{-1.0: 1}, {10.0: 1}, {21.0: 1}, {24.0: 1}, {28.0: 1}, {29.0: 1}, {32.0: 1}, {35.0:
1}, {42.0: 1}, {43.0: 1}, {55.0: 1}, {68.0: 1}, {83.0: 1}, {90.0: 1}, {96.0: 1}, {10
6.0: 1}, {109.0: 1}, {116.0: 1}, {125.0: 1}, {130.0: 1}, {132.0: 1}, {137.0: 1}, {13
9.0: 1}, {146.0: 1}, {153.0: 1}, {179.0: 1}, {197.0: 1}, {214.0: 1}, {218.0: 1}, {22
0.0: 1}, {221.0: 1}, {225.0: 1}, {231.0: 1}, {240.0: 1}, {241.0: 1}, {242.0: 1}, {24
6.0: 1}, {312.0: 1}, {316.0: 1}, {319.0: 1}, {321.0: 1}, {324.0: 1}, {328.0: 1}, {32
9.0: 1}, {330.0: 1}, {333.0: 1}, {336.0: 1}, {338.0: 1}, {341.0: 1}, {345.0: 1}, {38
6.0: 1}, {397.0: 1}, {410.0: 1}, {412.0: 1}, {426.0: 1}, {439.0: 1}, {442.0: 1}, {46
2.0: 1}, {470.0: 1}, {476.0: 1}, {477.0: 1}, {491.0: 1}, {502.0: 1}, {503.0: 1}, {50
8.0: 1}, {515.0: 1}, {524.0: 1}, {543.0: 1}, {563.0: 1}, {567.0: 1}, {569.0: 1}, {57
6.0: 1}, {595.0: 1}, {596.0: 1}, {603.0: 1}, {608.0: 1}, {622.0: 1}, {641.0: 1}, {68
3.0: 1}, {690.0: 2}, {692.0: 2}, {751.0: 2}, {753.0: 2}, {794.0: 2}, {807.0: 2}, {83
7.0: 2}, {879.0: 2}, {881.0: 2}, {882.0: 2}, {896.0: 2}, {897.0: 2}, {921.0: 2}, {97
0.0: 2}, {995.0: 2}, {998.0: 2}, {1006.0: 2}, {1008.0: 2}, {1034.0: 2}, {1048.0: 2},
{1057.0: 2}, {1066.0: 2}, {1073.0: 2}, {1109.0: 2}, {1173.0: 2}, {1191.0: 2}, {1203.
0: 2}, {1206.0: 2}, {1230.0: 2}, {1266.0: 2}, {1287.0: 2}, {1290.0: 2}, {1299.0: 2},
{1308.0: 2}, {1316.0: 2}, {1317.0: 2}, {1338.0: 2}, {1340.0: 2}, {1355.0: 2}, {1359.
0: 2}, {1392.0: 2}, {1401.0: 2}, {1403.0: 2}, {1422.0: 2}, {1434.0: 2}, {1441.0: 2},
{1463.0: 2}, {1469.0: 2}, {1472.0: 2}, {1478.0: 2}, {1492.0: 2}, {1500.0: 2}, {1508.
0: 2}, {1531.0: 2}, {1539.0: 2}, {1548.0: 2}, {1581.0: 2}, {1611.0: 2}, {1623.0: 2},
{1629.0: 2}, {1654.0: 2}, {1663.0: 2}, {1669.0: 2}, {1680.0: 2}, {1682.0: 2}, {1683.
0: 2}, {1729.0: 2}, {1750.0: 2}, {1756.0: 2}, {1758.0: 2}, {1767.0: 2}, {1769.0: 2},
{1775.0: 2}, {1790.0: 2}, {1792.0: 2}, {1829.0: 2}, {1844.0: 2}, {1868.0: 2}, {1886.
0: 2}, {1889.0: 3}, {1908.0: 3}, {1909.0: 3}, {1925.0: 3}, {1936.0: 3}, {1942.0: 3},
{1943.0: 3}, {1946.0: 3}, {1954.0: 3}, {1966.0: 3}, {2009.0: 3}, {2056.0: 3}, {2058.
0: 3}, {2091.0: 3}, {2094.0: 3}, {2104.0: 3}, {2117.0: 3}, {2143.0: 3}, {2150.0: 3},
{2183.0: 3}, {2193.0: 3}, {2217.0: 3}, {2218.0: 3}, {2220.0: 3}, {2222.0: 3}, {2226.
0: 3}, {2245.0: 3}, {2260.0: 3}, {2264.0: 3}, {2293.0: 3}, {2297.0: 3}, {2324.0: 3},
{2350.0: 3}, {2353.0: 3}, {2354.0: 3}, {2395.0: 3}, {2409.0: 3}, {2417.0: 3}, {2418.
0: 3}, {2419.0: 3}, {2420.0: 3}, {2421.0: 3}, {2439.0: 3}, {2468.0: 3}, {2520.0: 3},
{2524.0: 3}, {2538.0: 3}, {2541.0: 3}, {2547.0: 3}, {2548.0: 3}, {2575.0: 3}, {2585.
0: 3}, {2607.0: 3}, {2622.0: 3}, {2638.0: 3}, {2660.0: 3}, {2681.0: 3}, {2682.0: 3},
{2684.0: 3}, {2688.0: 3}, {2692.0: 3}, {2731.0: 3}, {2748.0: 3}, {2750.0: 3}, {2757.
0: 3}, {2765.0: 3}, {2773.0: 3}, {2774.0: 3}, {2822.0: 3}, {2825.0: 3}, {2826.0: 3},
{2835.0: 3}, {2850.0: 3}, {2857.0: 3}, {2881.0: 3}, {2896.0: 3}, {2916.0: 3}, {2922.
0: 3}, {2923.0: 3}, {2942.0: 4}, {2951.0: 4}, {2954.0: 4}, {2987.0: 4}, {2989.0: 4},
In [128… pq['airline.airline_id'].head()
Out[128…
# In [133…
def balance_partitions(keys, num_partitions):
    """Split ``keys`` into contiguous chunks whose sums are roughly equal.

    The running total of ``keys`` is cut at multiples of
    ``total // num_partitions``; the order of the elements is preserved.

    Args:
        keys: sequence of numbers (list, pandas Series or NumPy array).
            The cut search assumes the running total never decreases, i.e.
            that the values are non-negative.
        num_partitions: number of chunks to produce; must be at least 1.

    Returns:
        list of numpy.ndarray: ``num_partitions`` arrays that concatenate back
        to ``keys``.

    Raises:
        ValueError: if ``num_partitions`` is smaller than 1.
    """
    # Local import: numpy is not imported anywhere in the visible notebook.
    import numpy as np

    if num_partitions < 1:
        raise ValueError("num_partitions must be at least 1")

    # Accept plain lists as well as arrays; the caller passes a list.
    values = np.asarray(keys)
    if values.size == 0:
        # No running total to cut; return the requested number of empty chunks.
        return np.split(values, [0] * (num_partitions - 1))

    running_total = values.cumsum()
    # Target sum of a single chunk.
    part_sum = running_total[-1] // num_partitions
    # Running-total thresholds at which each chunk should end.
    thresholds = np.arange(1, num_partitions) * part_sum
    # First positions where the running total reaches each threshold.
    cut_points = np.searchsorted(running_total, thresholds)
    return np.split(values, cut_points)
# In [134…
# Airline ids of all remaining rows, to be split into seven partitions.
keys = list(pq['airline.airline_id'])
num_partitions = 7

# In [135…
# NOTE(review): the output recorded with this cell is a list of single-entry
# dicts, which is not what the balance_partitions definition shown in this
# notebook returns -- presumably it came from a different version; confirm.
print(balance_partitions(keys, num_partitions))
- 6. {2990.0: 4}, {2993.0: 4}, {2994.0: 4}, {3000.0: 4}, {3021.0: 4}, {3026.0: 4}, {3029.
0: 4}, {3052.0: 4}, {3081.0: 4}, {3090.0: 4}, {3097.0: 4}, {3123.0: 4}, {3126.0: 4},
{3148.0: 4}, {3163.0: 4}, {3179.0: 4}, {3197.0: 4}, {3200.0: 4}, {3201.0: 4}, {3210.
0: 4}, {3233.0: 4}, {3251.0: 4}, {3258.0: 4}, {3287.0: 4}, {3290.0: 4}, {3320.0: 4},
{3329.0: 4}, {3342.0: 4}, {3354.0: 4}, {3370.0: 4}, {3378.0: 4}, {3386.0: 4}, {3391.
0: 4}, {3392.0: 4}, {3393.0: 4}, {3432.0: 4}, {3437.0: 4}, {3463.0: 4}, {3490.0: 4},
{3498.0: 4}, {3534.0: 4}, {3539.0: 4}, {3545.0: 4}, {3547.0: 4}, {3574.0: 4}, {3589.
0: 4}, {3613.0: 4}, {3618.0: 4}, {3637.0: 4}, {3652.0: 4}, {3661.0: 4}, {3674.0: 4},
{3721.0: 4}, {3734.0: 4}, {3737.0: 4}, {3740.0: 4}, {3754.0: 4}, {3764.0: 4}, {3776.
0: 4}, {3778.0: 4}, {3781.0: 4}, {3783.0: 4}, {3788.0: 4}, {3805.0: 4}, {3811.0: 4},
{3826.0: 4}, {3834.0: 4}, {3835.0: 4}, {3850.0: 4}, {3856.0: 4}, {3857.0: 4}, {3865.
0: 4}, {3871.0: 4}, {3926.0: 4}, {3935.0: 5}, {3952.0: 5}, {3969.0: 5}, {3976.0: 5},
{4021.0: 5}, {4026.0: 5}, {4031.0: 5}, {4044.0: 5}, {4066.0: 5}, {4089.0: 5}, {4091.
0: 5}, {4165.0: 5}, {4178.0: 5}, {4234.0: 5}, {4248.0: 5}, {4255.0: 5}, {4259.0: 5},
{4292.0: 5}, {4296.0: 5}, {4304.0: 5}, {4305.0: 5}, {4311.0: 5}, {4319.0: 5}, {4329.
0: 5}, {4335.0: 5}, {4349.0: 5}, {4356.0: 5}, {4375.0: 5}, {4388.0: 5}, {4429.0: 5},
{4435.0: 5}, {4436.0: 5}, {4438.0: 5}, {4454.0: 5}, {4475.0: 5}, {4496.0: 5}, {4513.
0: 5}, {4521.0: 5}, {4533.0: 5}, {4547.0: 5}, {4550.0: 5}, {4559.0: 5}, {4573.0: 5},
{4599.0: 5}, {4608.0: 5}, {4609.0: 5}, {4611.0: 5}, {4687.0: 5}, {4691.0: 5}, {4735.
0: 5}, {4737.0: 5}, {4740.0: 5}, {4750.0: 5}, {4752.0: 5}, {4797.0: 5}, {4805.0: 5},
{4808.0: 5}, {4822.0: 5}, {4840.0: 5}, {4863.0: 5}, {4867.0: 5}, {4869.0: 5}, {4870.
0: 5}, {4897.0: 5}, {4936.0: 5}, {4937.0: 5}, {4940.0: 5}, {4947.0: 5}, {4951.0: 5},
{4965.0: 5}, {5002.0: 5}, {5013.0: 5}, {5016.0: 5}, {5038.0: 5}, {5039.0: 5}, {5041.
0: 5}, {5067.0: 5}, {5083.0: 5}, {5085.0: 5}, {5097.0: 6}, {5133.0: 6}, {5156.0: 6},
{5179.0: 6}, {5188.0: 6}, {5209.0: 6}, {5234.0: 6}, {5265.0: 6}, {5281.0: 6}, {5282.
0: 6}, {5297.0: 6}, {5309.0: 6}, {5325.0: 6}, {5331.0: 6}, {5333.0: 6}, {5347.0: 6},
{5354.0: 6}, {5360.0: 6}, {5368.0: 6}, {5399.0: 6}, {5416.0: 6}, {5439.0: 6}, {5461.
0: 6}, {5479.0: 6}, {5484.0: 6}, {5496.0: 6}, {5521.0: 6}, {5523.0: 6}, {5651.0: 6},
{5813.0: 6}, {5982.0: 6}, {6557.0: 6}, {8359.0: 6}, {8463.0: 6}, {8576.0: 6}, {8745.
0: 6}, {8809.0: 6}, {9082.0: 6}, {9531.0: 6}, {9541.0: 6}, {9620.0: 6}, {9666.0: 6},
{9764.0: 6}, {9784.0: 6}, {9809.0: 6}, {9810.0: 6}, {9818.0: 6}, {9828.0: 6}, {9829.
0: 6}, {10121.0: 6}, {10122.0: 6}, {10128.0: 6}, {10646.0: 6}, {10650.0: 6}, {10675.
0: 6}, {10737.0: 6}, {10739.0: 6}, {10741.0: 6}, {10758.0: 6}, {10765.0: 6}, {10776.
0: 6}, {10800.0: 6}, {10912.0: 6}, {10955.0: 6}, {11741.0: 6}, {11763.0: 6}, {11794.
0: 6}, {11806.0: 6}, {11808.0: 6}, {11811.0: 6}, {11814.0: 6}, {11838.0: 6}, {11857.
0: 6}, {11948.0: 6}, {11963.0: 6}, {12978.0: 6}, {13088.0: 6}, {13108.0: 6}, {13200.
0: 6}, {13335.0: 7}, {13704.0: 7}, {13757.0: 7}, {13899.0: 7}, {13983.0: 7}, {14061.
0: 7}, {14118.0: 7}, {14485.0: 7}, {14849.0: 7}, {15814.0: 7}, {15837.0: 7}, {15893.
0: 7}, {15999.0: 7}, {16120.0: 7}, {16133.0: 7}, {16136.0: 7}, {16149.0: 7}, {16150.
0: 7}, {16262.0: 7}, {16415.0: 7}, {16475.0: 7}, {16508.0: 7}, {16615.0: 7}, {16624.
0: 7}, {16660.0: 7}, {16707.0: 7}, {16725.0: 7}, {16726.0: 7}, {16844.0: 7}, {16882.
0: 7}, {16942.0: 7}, {16960.0: 7}, {16963.0: 7}, {17023.0: 7}, {17083.0: 7}, {17094.
0: 7}, {17095.0: 7}, {17099.0: 7}, {17408.0: 7}, {17519.0: 7}, {17675.0: 7}, {17885.
0: 7}, {17891.0: 7}, {18169.0: 7}, {18232.0: 7}, {18529.0: 7}, {18543.0: 7}, {18553.
0: 7}, {18700.0: 7}, {18732.0: 7}, {18825.0: 7}, {18828.0: 7}, {18944.0: 7}, {18946.
0: 7}, {19016.0: 7}, {19305.0: 7}, {19582.0: 7}, {19610.0: 7}, {19676.0: 7}, {19804.
0: 7}, {19810.0: 7}, {19944.0: 7}, {20004.0: 7}, {20047.0: 7}, {20160.0: 7}, {20270.
0: 7}, {20565.0: 7}, {20577.0: 7}, {20686.0: 7}, {20710.0: 7}, {20963.0: 7}, {20976.
0: 7}, {21012.0: 7}]
In [ ]: