Assignment 3.pdf

D

Big Data

1
Assignment 3
September 19, 2021
Assignment 3
Import libraries and define common helper functions
2
Load the records from https://storage.budsc.midwest-datascience.com/data/processed/openflights/routes.jsonl.gz
[5]: [{'airline': {'airline_id': 410,
'name': 'Aerocondor',
'alias': 'ANA All Nippon Airways',
'iata': '2B',
'icao': 'ARD',
'callsign': 'AEROCONDOR',
'country': 'Portugal',
'active': True},
'src_airport': {'airport_id': 2965,
'name': 'Sochi International Airport',
'city': 'Sochi',
'country': 'Russia',
'iata': 'AER',
'icao': 'URSS',
'latitude': 43.449902,
'longitude': 39.9566,
'altitude': 89,
'timezone': 3.0,
'dst': 'N',
'tz_id': 'Europe/Moscow',
'type': 'airport',
'source': 'OurAirports'},
'dst_airport': {'airport_id': 2990,
'name': 'Kazan International Airport',
'city': 'Kazan',
'country': 'Russia',
'iata': 'KZN',
'icao': 'UWKD',
'latitude': 55.606201171875,
'longitude': 49.278701782227,
'altitude': 411,
'timezone': 3.0,
'dst': 'N',
3
'tz_id': 'Europe/Moscow',
'type': 'airport',
'source': 'OurAirports'},
'codeshare': False,
Collecting genson
Using cached genson-1.2.2-py2.py3-none-any.whl
Installing collected packages: genson
Successfully installed genson-1.2.2
{
"$schema": "http://json-schema.org/schema#",
"anyOf": [
{
"type": "object"
},
{
"type": "array",
"items": {
"type": "object",
"properties": {
"airline": {
"type": "object",
"properties": {
"airline_id": {
"type": "integer"
},
"name": {
"type": "string"
},
"alias": {
"type": "string"
},
"iata": {
"type": "string"
},
"icao": {
4
"type": "string"
},
"callsign": {
"type": "string"
},
"country": {
"type": "string"
},
"active": {
"type": "boolean"
}
},
"required": [
"active",
"airline_id",
"alias",
"callsign",
"country",
"iata",
"icao",
"name"
]
},
"src_airport": {
"anyOf": [
{
"type": "null"
},
{
"type": "object",
"properties": {
"airport_id": {
"type": "integer"
},
"name": {
"type": "string"
},
"city": {
"type": "string"
},
"country": {
"type": "string"
},
"iata": {
"type": "string"
},
"icao": {
5
"type": "string"
},
"latitude": {
"type": "number"
},
"longitude": {
"type": "number"
},
"altitude": {
"type": "integer"
},
"timezone": {
"type": "number"
},
"dst": {
"type": "string"
},
"tz_id": {
"type": "string"
},
"type": {
"type": "string"
},
"source": {
"type": "string"
}
},
"required": [
"airport_id",
"altitude",
"city",
"country",
"dst",
"iata",
"icao",
"latitude",
"longitude",
"name",
"source",
"timezone",
"type",
"tz_id"
]
}
]
},
"dst_airport": {
6
"anyOf": [
{
"type": "null"
},
{
"type": "object",
"properties": {
"airport_id": {
"type": "integer"
},
"name": {
"type": "string"
},
"city": {
"type": "string"
},
"country": {
"type": "string"
},
"iata": {
"type": "string"
},
"icao": {
"type": "string"
},
"latitude": {
"type": "number"
},
"longitude": {
"type": "number"
},
"altitude": {
"type": "integer"
},
"timezone": {
"type": "number"
},
"dst": {
"type": "string"
},
"tz_id": {
"type": "string"
},
"type": {
"type": "string"
},
"source": {
7
"type": "string"
}
},
"required": [
"airport_id",
"altitude",
"city",
"country",
"dst",
"iata",
"icao",
"latitude",
"longitude",
"name",
"source",
"timezone",
"type",
"tz_id"
]
}
]
},
"codeshare": {
"type": "boolean"
},
"equipment": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": [
"airline",
"codeshare",
"dst_airport",
"equipment",
"src_airport"
]
}
}
]
}
8
3.1.a JSON Schema
{'$schema': 'http://json-schema.org/draft-04/schema#', 'type':
'object', 'properties': {'airline': {'type': 'object', 'properties':
{'active': {'type': 'boolean'}, 'airline_id': {'type': 'integer'},
'alias': {'type': 'string'},
'callsign': {'type': 'string'}, 'country': {'type': 'string'}, 'iata':
{'type':
'string'}, 'icao': {'type': 'string'}, 'name': {'type': 'string'}},
'required': ['active', 'airline_id', 'alias', 'callsign', 'country',
'iata', 'icao',
'name']}, 'codeshare': {'type': 'boolean'}, 'dst_airport': {'type':
['object',
'null'], 'properties': {'airport_id': {'type': 'integer'}, 'altitude':
{'type': 'integer'}, 'city': {'type': 'string'}, 'country': {'type':
'string'}, 'dst': {'type': 'string'}, 'iata': {'type': 'string'},
'icao': {'type': 'string'}, 'latitude': {'type': 'number'},
'longitude': {'type': 'number'}, 'name':
{'type': 'string'}, 'source': {'type': 'string'}, 'timezone': {'type':
'number'}, 'type': {'type': 'string'}, 'tz_id': {'type': 'string'}},
'required':
9
['airport_id', 'altitude', 'city', 'country', 'dst', 'iata', 'icao',
'latitude', 'longitude', 'name', 'source', 'timezone', 'type',
'tz_id']}, 'equipment':
{'type': 'array', 'items': [{'type': 'string'}]}, 'src_airport':
{'type':
['object', 'null'], 'properties': {'airport_id': {'type': 'integer'},
'altitude': {'type': 'integer'}, 'city': {'type': 'string'},
'country': {'type':
'string'}, 'dst': {'type': 'string'}, 'iata': {'type': 'string'},
'icao': {'type': 'string'}, 'latitude': {'type': 'number'},
'longitude': {'type':
'number'}, 'name': {'type': 'string'}, 'source': {'type': 'string'},
'timezone':
{'type': 'number'}, 'type': {'type': 'string'}, 'tz_id': {'type':
'string'}},
'required': ['airport_id', 'altitude', 'city', 'country', 'dst',
'iata', 'icao',
'latitude', 'longitude', 'name', 'source', 'timezone', 'type',
'tz_id']}},
'required': ['airline', 'codeshare', 'dst_airport', 'equipment',
'src_airport']}
3.1.b Avro
/home/adil/dsc650/dsc650/assignments/assignment03/results/routes.avro
10
3.1.c Parquet
pyarrow.Table
airline: struct<airline_id: int64, name: string, alias: string, iata: string, icao: string, callsign: string, country: string, active: bool>
  child 0, airline_id: int64
  child 1, name: string
  child 2, alias: string
  child 3, iata: string
  child 4, icao: string
  child 5, callsign: string
  child 6, country: string
  child 7, active: bool
src_airport: struct<airport_id: int64, name: string, city: string, country: string, iata: string, icao: string, latitude: double, longitude: double, altitude: int64, timezone: double, dst: string, tz_id: string, type: string, source: string>
  child 0, airport_id: int64
  child 1, name: string
  child 2, city: string
  child 3, country: string
  child 4, iata: string
  child 5, icao: string
  child 6, latitude: double
,→
11
  child 7, longitude: double
  child 8, altitude: int64
  child 9, timezone: double
  child 10, dst: string
  child 11, tz_id: string
  child 12, type: string
  child 13, source: string
dst_airport: struct<airport_id: int64, name: string, city: string, country: string, iata: string, icao: string, latitude: double, longitude: double, altitude: int64, timezone: double, dst: string, tz_id: string, type: string, source: string>
  child 0, airport_id: int64
  child 1, name: string
  child 2, city: string
  child 3, country: string
  child 4, iata: string
  child 5, icao: string
  child 6, latitude: double
  child 7, longitude: double
  child 8, altitude: int64
  child 9, timezone: double
  child 10, dst: string
  child 11, tz_id: string
  child 12, type: string
  child 13, source: string
codeshare: bool
equipment: list<item: string>
  child 0, item: string
12
3.1.d Protocol Buffers
13
14
3.2.a Simple Geohash Index
15
3.2.b Simple Search Feature
16
20000.0 km
Eppley Airfield
3.1.e Output Sizes
[1]: import os
routes_avro_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/assignment03/results/routes.avro")
print (routes_avro_size, "bytes")
19646227 bytes
[56]: routes_parquet_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/assignment03/results/routes.parquet")
print (routes_parquet_size, "bytes")
2327907 bytes
[57]: routes_snappy_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/assignment03/results/routes.pb.snappy")
print (routes_snappy_size, "bytes")
3705406 bytes
[58]: routes_pb_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/assignment03/results/routes.pb")
print (routes_pb_size, "bytes")
22270594 bytes
[59]: routes_JSONSCHEMA_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/assignment03/schemas/routes-schema.json")
print (routes_JSONSCHEMA_size, "bytes")
3461 bytes
[60]: routes_JSONSCHEMA_gzsize =
os.path.getsize("/home/adil/dsc650/data/processed/
17
,→openflights/routes.jsonl.gz")
print (routes_JSONSCHEMA_gzsize, "bytes")
3327145 bytes
[ ]:

Recomendados

Assignment7.pdf von
Assignment7.pdfAssignment7.pdf
Assignment7.pdfdash41
37 views6 Folien
Assignment 6.3.pdf von
Assignment 6.3.pdfAssignment 6.3.pdf
Assignment 6.3.pdfdash41
16 views2 Folien
Assignment 6.2b.pdf von
Assignment 6.2b.pdfAssignment 6.2b.pdf
Assignment 6.2b.pdfdash41
9 views4 Folien
Assignment 6.2a.pdf von
Assignment 6.2a.pdfAssignment 6.2a.pdf
Assignment 6.2a.pdfdash41
10 views4 Folien
Assignment 6.1.pdf von
Assignment 6.1.pdfAssignment 6.1.pdf
Assignment 6.1.pdfdash41
14 views4 Folien
Assignment 5.3.pdf von
Assignment 5.3.pdfAssignment 5.3.pdf
Assignment 5.3.pdfdash41
19 views4 Folien

Más contenido relacionado

Destacado

ChatGPT and the Future of Work - Clark Boyd von
ChatGPT and the Future of Work - Clark Boyd ChatGPT and the Future of Work - Clark Boyd
ChatGPT and the Future of Work - Clark Boyd Clark Boyd
28.7K views69 Folien
Getting into the tech field. what next von
Getting into the tech field. what next Getting into the tech field. what next
Getting into the tech field. what next Tessa Mero
6.7K views22 Folien
Google's Just Not That Into You: Understanding Core Updates & Search Intent von
Google's Just Not That Into You: Understanding Core Updates & Search IntentGoogle's Just Not That Into You: Understanding Core Updates & Search Intent
Google's Just Not That Into You: Understanding Core Updates & Search IntentLily Ray
7K views99 Folien
How to have difficult conversations von
How to have difficult conversations How to have difficult conversations
How to have difficult conversations Rajiv Jayarajah, MAppComm, ACC
5.7K views19 Folien
Introduction to Data Science von
Introduction to Data ScienceIntroduction to Data Science
Introduction to Data ScienceChristy Abraham Joy
82.6K views51 Folien
Time Management & Productivity - Best Practices von
Time Management & Productivity -  Best PracticesTime Management & Productivity -  Best Practices
Time Management & Productivity - Best PracticesVit Horky
169.8K views42 Folien

Destacado(20)

ChatGPT and the Future of Work - Clark Boyd von Clark Boyd
ChatGPT and the Future of Work - Clark Boyd ChatGPT and the Future of Work - Clark Boyd
ChatGPT and the Future of Work - Clark Boyd
Clark Boyd28.7K views
Getting into the tech field. what next von Tessa Mero
Getting into the tech field. what next Getting into the tech field. what next
Getting into the tech field. what next
Tessa Mero6.7K views
Google's Just Not That Into You: Understanding Core Updates & Search Intent von Lily Ray
Google's Just Not That Into You: Understanding Core Updates & Search IntentGoogle's Just Not That Into You: Understanding Core Updates & Search Intent
Google's Just Not That Into You: Understanding Core Updates & Search Intent
Lily Ray7K views
Time Management & Productivity - Best Practices von Vit Horky
Time Management & Productivity -  Best PracticesTime Management & Productivity -  Best Practices
Time Management & Productivity - Best Practices
Vit Horky169.8K views
The six step guide to practical project management von MindGenius
The six step guide to practical project managementThe six step guide to practical project management
The six step guide to practical project management
MindGenius36.7K views
Beginners Guide to TikTok for Search - Rachel Pearson - We are Tilt __ Bright... von RachelPearson36
Beginners Guide to TikTok for Search - Rachel Pearson - We are Tilt __ Bright...Beginners Guide to TikTok for Search - Rachel Pearson - We are Tilt __ Bright...
Beginners Guide to TikTok for Search - Rachel Pearson - We are Tilt __ Bright...
RachelPearson3612.8K views
Unlocking the Power of ChatGPT and AI in Testing - A Real-World Look, present... von Applitools
Unlocking the Power of ChatGPT and AI in Testing - A Real-World Look, present...Unlocking the Power of ChatGPT and AI in Testing - A Real-World Look, present...
Unlocking the Power of ChatGPT and AI in Testing - A Real-World Look, present...
Applitools55.5K views
12 Ways to Increase Your Influence at Work von GetSmarter
12 Ways to Increase Your Influence at Work12 Ways to Increase Your Influence at Work
12 Ways to Increase Your Influence at Work
GetSmarter401.7K views
Ride the Storm: Navigating Through Unstable Periods / Katerina Rudko (Belka G... von DevGAMM Conference
Ride the Storm: Navigating Through Unstable Periods / Katerina Rudko (Belka G...Ride the Storm: Navigating Through Unstable Periods / Katerina Rudko (Belka G...
Ride the Storm: Navigating Through Unstable Periods / Katerina Rudko (Belka G...
DevGAMM Conference3.6K views
Barbie - Brand Strategy Presentation von Erica Santiago
Barbie - Brand Strategy PresentationBarbie - Brand Strategy Presentation
Barbie - Brand Strategy Presentation
Erica Santiago25.1K views
Good Stuff Happens in 1:1 Meetings: Why you need them and how to do them well von Saba Software
Good Stuff Happens in 1:1 Meetings: Why you need them and how to do them wellGood Stuff Happens in 1:1 Meetings: Why you need them and how to do them well
Good Stuff Happens in 1:1 Meetings: Why you need them and how to do them well
Saba Software25.3K views
Introduction to C Programming Language von Simplilearn
Introduction to C Programming LanguageIntroduction to C Programming Language
Introduction to C Programming Language
Simplilearn8.5K views
The Pixar Way: 37 Quotes on Developing and Maintaining a Creative Company (fr... von Palo Alto Software
The Pixar Way: 37 Quotes on Developing and Maintaining a Creative Company (fr...The Pixar Way: 37 Quotes on Developing and Maintaining a Creative Company (fr...
The Pixar Way: 37 Quotes on Developing and Maintaining a Creative Company (fr...
Palo Alto Software88.4K views
9 Tips for a Work-free Vacation von Weekdone.com
9 Tips for a Work-free Vacation9 Tips for a Work-free Vacation
9 Tips for a Work-free Vacation
Weekdone.com7.2K views
How to Map Your Future von SlideShop.com
How to Map Your FutureHow to Map Your Future
How to Map Your Future
SlideShop.com275.1K views

Assignment 3.pdf

  • 1. 1 Assignment 3 September 19, 2021 Assignment 3 Import libraries and define common helper functions
  • 2. 2 Load the records from https://storage.budsc.midwest-datascience.com/data/processed/openflights/routes.jsonl.gz [5]: [{'airline': {'airline_id': 410, 'name': 'Aerocondor', 'alias': 'ANA All Nippon Airways', 'iata': '2B', 'icao': 'ARD', 'callsign': 'AEROCONDOR', 'country': 'Portugal', 'active': True}, 'src_airport': {'airport_id': 2965, 'name': 'Sochi International Airport', 'city': 'Sochi', 'country': 'Russia', 'iata': 'AER', 'icao': 'URSS', 'latitude': 43.449902, 'longitude': 39.9566, 'altitude': 89, 'timezone': 3.0, 'dst': 'N', 'tz_id': 'Europe/Moscow', 'type': 'airport', 'source': 'OurAirports'}, 'dst_airport': {'airport_id': 2990, 'name': 'Kazan International Airport', 'city': 'Kazan', 'country': 'Russia', 'iata': 'KZN', 'icao': 'UWKD', 'latitude': 55.606201171875, 'longitude': 49.278701782227, 'altitude': 411, 'timezone': 3.0, 'dst': 'N',
  • 3. 3 'tz_id': 'Europe/Moscow', 'type': 'airport', 'source': 'OurAirports'}, 'codeshare': False, Collecting genson Using cached genson-1.2.2-py2.py3-none-any.whl Installing collected packages: genson Successfully installed genson-1.2.2 { "$schema": "http://json-schema.org/schema#", "anyOf": [ { "type": "object" }, { "type": "array", "items": { "type": "object", "properties": { "airline": { "type": "object", "properties": { "airline_id": { "type": "integer" }, "name": { "type": "string" }, "alias": { "type": "string" }, "iata": { "type": "string" }, "icao": {
  • 4. 4 "type": "string" }, "callsign": { "type": "string" }, "country": { "type": "string" }, "active": { "type": "boolean" } }, "required": [ "active", "airline_id", "alias", "callsign", "country", "iata", "icao", "name" ] }, "src_airport": { "anyOf": [ { "type": "null" }, { "type": "object", "properties": { "airport_id": { "type": "integer" }, "name": { "type": "string" }, "city": { "type": "string" }, "country": { "type": "string" }, "iata": { "type": "string" }, "icao": {
  • 5. 5 "type": "string" }, "latitude": { "type": "number" }, "longitude": { "type": "number" }, "altitude": { "type": "integer" }, "timezone": { "type": "number" }, "dst": { "type": "string" }, "tz_id": { "type": "string" }, "type": { "type": "string" }, "source": { "type": "string" } }, "required": [ "airport_id", "altitude", "city", "country", "dst", "iata", "icao", "latitude", "longitude", "name", "source", "timezone", "type", "tz_id" ] } ] }, "dst_airport": {
  • 6. 6 "anyOf": [ { "type": "null" }, { "type": "object", "properties": { "airport_id": { "type": "integer" }, "name": { "type": "string" }, "city": { "type": "string" }, "country": { "type": "string" }, "iata": { "type": "string" }, "icao": { "type": "string" }, "latitude": { "type": "number" }, "longitude": { "type": "number" }, "altitude": { "type": "integer" }, "timezone": { "type": "number" }, "dst": { "type": "string" }, "tz_id": { "type": "string" }, "type": { "type": "string" }, "source": {
  • 7. 7 "type": "string" } }, "required": [ "airport_id", "altitude", "city", "country", "dst", "iata", "icao", "latitude", "longitude", "name", "source", "timezone", "type", "tz_id" ] } ] }, "codeshare": { "type": "boolean" }, "equipment": { "type": "array", "items": { "type": "string" } } }, "required": [ "airline", "codeshare", "dst_airport", "equipment", "src_airport" ] } } ] }
  • 8. 8 3.1. a JSON Schema {'$schema': 'http://json-schema.org/draft-04/schema#', 'type': 'object', 'properties': {'airline': {'type': 'object', 'properties': {'active': {'type': 'boolean'}, 'airline_id': {'type': 'integer'}, 'alias': {'type': 'string'}, 'callsign': {'type': 'string'}, 'country': {'type': 'string'}, 'iata': {'type': 'string'}, 'icao': {'type': 'string'}, 'name': {'type': 'string'}}, 'required': ['active', 'airline_id', 'alias', 'callsign', 'country', 'iata', 'icao', 'name']}, 'codeshare': {'type': 'boolean'}, 'dst_airport': {'type': ['object', 'null'], 'properties': {'airport_id': {'type': 'integer'}, 'altitude': {'type': 'integer'}, 'city': {'type': 'string'}, 'country': {'type': 'string'}, 'dst': {'type': 'string'}, 'iata': {'type': 'string'}, 'icao': {'type': 'string'}, 'latitude': {'type': 'number'}, 'longitude': {'type': 'number'}, 'name': {'type': 'string'}, 'source': {'type': 'string'}, 'timezone': {'type': 'number'}, 'type': {'type': 'string'}, 'tz_id': {'type': 'string'}}, 'required':
  • 9. 9 ['airport_id', 'altitude', 'city', 'country', 'dst', 'iata', 'icao', 'latitude', 'longitude', 'name', 'source', 'timezone', 'type', 'tz_id']}, 'equipment': {'type': 'array', 'items': [{'type': 'string'}]}, 'src_airport': {'type': ['object', 'null'], 'properties': {'airport_id': {'type': 'integer'}, 'altitude': {'type': 'integer'}, 'city': {'type': 'string'}, 'country': {'type': 'string'}, 'dst': {'type': 'string'}, 'iata': {'type': 'string'}, 'icao': {'type': 'string'}, 'latitude': {'type': 'number'}, 'longitude': {'type': 'number'}, 'name': {'type': 'string'}, 'source': {'type': 'string'}, 'timezone': {'type': 'number'}, 'type': {'type': 'string'}, 'tz_id': {'type': 'string'}}, 'required': ['airport_id', 'altitude', 'city', 'country', 'dst', 'iata', 'icao', 'latitude', 'longitude', 'name', 'source', 'timezone', 'type', 'tz_id']}}, 'required': ['airline', 'codeshare', 'dst_airport', 'equipment', 'src_airport']} 3.1.b Avro /home/adil/dsc650/dsc650/assignments/assignment03/results/routes.avro
  • 10. 10 3.1.c Parquet pyarrow.Table airline: struct<airline_id: int64, name: string, alias: string, iata: string, icao: string, callsign: string, country: string, active: bool> child 0, airline_id: int64 child 1, name: string child 2, alias: string child 3, iata: string child 4, icao: string child 5, callsign: string child 6, country: string child 7, active: bool src_airport: struct<airport_id: int64, name: string, city: string, country: string, iata: string, icao: string, latitude: double, longitude: double, altitude: int64, timezone: double, dst: string, tz_id: string, type: string, source: string> child 0, airport_id: int64 child 1, name: string child 2, city: string child 3, country: string child 4, iata: string child 5, icao: string child 6, latitude: double ,→
  • 11. 11 child 7, longitude: double child 8, altitude: int64 child 9, timezone: double child 10, dst: string child 11, tz_id: string child 12, type: string child 13, source: string dst_airport: struct<airport_id: int64, name: string, city: string, country: string, iata: string, icao: string, latitude: double, longitude: double, altitude: int64, timezone: double, dst: string, tz_id: string, type: string, source: string> child 0, airport_id: int64 child 1, name: string child 2, city: string child 3, country: string child 4, iata: string child 5, icao: string child 6, latitude: double child 7, longitude: double child 8, altitude: int64 child 9, timezone: double child 10, dst: string child 11, tz_id: string child 12, type: string child 13, source: string codeshare: bool equipment: list<item: string> child 0, item: string
  • 13. 13
  • 16. 16 20000.0 km Eppley Airfield 3.1 e Output Sizes [1]: import os routes_avro_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/ ,→assignment03/results/routes.avro") print (routes_avro_size, "bytes") 19646227 bytes [56]: routes_parquet_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/ ,→assignment03/results/routes.parquet" ) print (routes_parquet_size, "bytes") 2327907 bytes [57]: routes_snappy_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/ ,→assignment03/results/routes.pb.snappy" ) print (routes_snappy_size, "bytes") 3705406 bytes [58]: routes_pb_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/ ,→assignment03/results/routes.pb") print (routes_pb_size, "bytes") 22270594 bytes [59]: routes_JSONSCHEMA_size = os.path.getsize("/home/adil/dsc650/dsc650/ ,→assignments/assignment03/schemas/routes- schema.json") print (routes_JSONSCHEMA_size, "bytes") 3461 bytes [60]: routes_JSONSCHEMA_gzsize = os.path.getsize("/home/adil/dsc650/data/processed/