Numba-compiled Python UDFs for Impala (Impala Meetup 5/20/14)

1
Compiled Python UDFs for Impala
Uri Laserson
20 May 2014

Impala User-defined Functions (UDFs)
• Tuple => Scalar value
• Substring
• sin, cos, pow, …
• Machine-learning models
• Supports Hive UDFs (Java)
• Relatively unpleasurable
• Slower
• Impala (native) UDFs
• C++ interface designed for efficiency
• Similar to Postgres UDFs
• Runs any LLVM-compiled code
2

LLVM compiler infrastructure
3

LLVM: C++ example
4
bool StringEq(FunctionContext* context,
const StringVal& arg1,
const StringVal& arg2) {
if (arg1.is_null != arg2.is_null)
return false;
if (arg1.is_null)
return true;
if (arg1.len != arg2.len)
return false;
return (arg1.ptr == arg2.ptr) ||
memcmp(arg1.ptr, arg2.ptr, arg1.len) == 0;
}

LLVM: IR output
5
; ModuleID = '<stdin>'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.0"
%"class.impala_udf::FunctionContext" = type { %"class.impala::FunctionContextImpl"* }
%"class.impala::FunctionContextImpl" = type opaque
%"struct.impala_udf::StringVal" = type { %"struct.impala_udf::AnyVal", i32, i8* }
%"struct.impala_udf::AnyVal" = type { i8 }
; Function Attrs: nounwind readonly ssp uwtable
define zeroext i1 @_Z8StringEqPN10impala_udf15FunctionContextERKNS_9StringValES4_(%"class.impala_udf::FunctionContext"* nocapture %context, %"struct.impala_udf::StringVal"*
nocapture %arg1, %"struct.impala_udf::StringVal"* nocapture %arg2) #0 {
entry:
%is_null = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg1, i64 0, i32 0, i32 0
%0 = load i8* %is_null, align 1, !tbaa !0, !range !3
%is_null1 = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg2, i64 0, i32 0, i32 0
%1 = load i8* %is_null1, align 1, !tbaa !0, !range !3
%cmp = icmp eq i8 %0, %1
br i1 %cmp, label %if.end, label %return
if.end: ; preds = %entry
%tobool = icmp eq i8 %0, 0
br i1 %tobool, label %if.end7, label %return
if.end7: ; preds = %if.end
%len = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg1, i64 0, i32 1
%2 = load i32* %len, align 4, !tbaa !4
%len8 = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg2, i64 0, i32 1
%3 = load i32* %len8, align 4, !tbaa !4
%cmp9 = icmp eq i32 %2, %3
br i1 %cmp9, label %if.end11, label %return
if.end11: ; preds = %if.end7
%ptr = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg1, i64 0, i32 2
%4 = load i8** %ptr, align 8, !tbaa !5
%ptr12 = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg2, i64 0, i32 2
%5 = load i8** %ptr12, align 8, !tbaa !5
%cmp13 = icmp eq i8* %4, %5
br i1 %cmp13, label %return, label %lor.rhs
lor.rhs: ; preds = %if.end11
%conv17 = sext i32 %2 to i64
%call = tail call i32 @memcmp(i8* %4, i8* %5, i64 %conv17)
%cmp18 = icmp eq i32 %call, 0
br label %return

LLVM: IR output
6
; ModuleID = '<stdin>'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.0"
%"class.impala_udf::FunctionContext" = type { %"class.impala::FunctionContextImpl"* }
%"class.impala::FunctionContextImpl" = type opaque
%"struct.impala_udf::StringVal" = type { %"struct.impala_udf::AnyVal", i32, i8* }
%"struct.impala_udf::AnyVal" = type { i8 }
; Function Attrs: nounwind readonly ssp uwtable
define zeroext i1 @_Z8StringEqPN10impala_udf15FunctionContextERKNS_9StringValES4_(%"class.impala_udf::FunctionContext"* nocapture %context, %"struct.impala_udf::StringVal"*
nocapture %arg1, %"struct.impala_udf::StringVal"* nocapture %arg2) #0 {
entry:
%is_null = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg1, i64 0, i32 0, i32 0
%0 = load i8* %is_null, align 1, !tbaa !0, !range !3
%is_null1 = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg2, i64 0, i32 0, i32 0
%1 = load i8* %is_null1, align 1, !tbaa !0, !range !3
%cmp = icmp eq i8 %0, %1
br i1 %cmp, label %if.end, label %return
if.end: ; preds = %entry
%tobool = icmp eq i8 %0, 0
br i1 %tobool, label %if.end7, label %return
if.end7: ; preds = %if.end
%len = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg1, i64 0, i32 1
%2 = load i32* %len, align 4, !tbaa !4
%len8 = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg2, i64 0, i32 1
%3 = load i32* %len8, align 4, !tbaa !4
%cmp9 = icmp eq i32 %2, %3
br i1 %cmp9, label %if.end11, label %return
if.end11: ; preds = %if.end7
%ptr = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg1, i64 0, i32 2
%4 = load i8** %ptr, align 8, !tbaa !5
%ptr12 = getelementptr inbounds %"struct.impala_udf::StringVal"* %arg2, i64 0, i32 2
%5 = load i8** %ptr12, align 8, !tbaa !5
%cmp13 = icmp eq i8* %4, %5
br i1 %cmp13, label %return, label %lor.rhs
lor.rhs: ; preds = %if.end11
%conv17 = sext i32 %2 to i64
%call = tail call i32 @memcmp(i8* %4, i8* %5, i64 %conv17)
%cmp18 = icmp eq i32 %call, 0
br label %return

Data type compatibility
7
struct AnyVal {
bool is_null;
};
struct StringVal : public AnyVal {
int len;
uint8_t* ptr;
};
%AnyVal = type { i8 }
%StringVal = type { %AnyVal, i32, i8* }
; or
%StringVal = type { { i8 }, i32, i8* }
C++ → LLVM IR

Register and execute the function
8
CREATE FUNCTION StringEq(STRING, STRING)
RETURNS BOOLEAN
LOCATION '/path/to/bitcode.ll'
SYMBOL='StringEq';
SELECT StringEq(a, b) FROM mytable;

Impyla: Python Library for Impala
• pip install impyla
• DB API v2.0 (PEP 249) compatible
• Prototype sklearn API for Impala ML
• Numba integration (described here)
• See blog post:
http://blog.cloudera.com/blog/2014/04/a-new-python-client-for-impala/
10

LLVM: Python example
11
@udf(IntVal(FunctionContext, StringVal))
def hour_from_weird_date_format(context, date):
    return int(split(date, '-')[1])
ship_udf(cursor, hour_from_weird_date_format,
         '/path/to/store/udf.ll', 'my.impala.host')
cursor.execute('SELECT hour_from_weird_date_format(date) ' +
               'AS hour FROM mytable LIMIT 100')

Model Scoring: BigML on Census Data
12
MLaaS

Model Scoring: BigML on Census Data
13

Example: 100 Node Decision Tree
14
def predict_income(impala_function_context, age, workclass, final_weight, education, education_num, marital_status, occupation, relationship,
race, sex, hours_per_week, native_country, income):
if (marital_status is None):
return '<=50K'
if (marital_status == 'Married-civ-spouse'):
if (education_num is None):
return '<=50K'
if (education_num > 12):
if (hours_per_week is None):
return '>50K'
if (hours_per_week > 31):
if (age is None):
return '>50K'
if (age > 28):
if (education_num > 13):
if (age > 58):
return '>50K'
if (age <= 58):
return '>50K'
if (education_num <= 13):
if (occupation is None):
return '>50K'
if (occupation == 'Exec-managerial'):
return '>50K'
if (occupation != 'Exec-managerial'):
return '>50K'
if (age <= 28):
if (age > 24):
if (occupation is None):
return '<=50K'
if (occupation == 'Tech-support'):
return '>50K'
if (occupation != 'Tech-support'):
return '<=50K'
if (age <= 24):
if (final_weight is None):
return '<=50K'
if (final_weight > 492053):
return '>50K'
if (final_weight <= 492053):
return '<=50K'
if (hours_per_week <= 31):
if (sex is None):
return '<=50K'
if (sex == 'Male'):

Batch Scoring with PySpark
15
# parse the text data
observations = sc.textFile('/path/to/census_data').map(parse_obs)
# perform batch scoring
predictions = observations.map(lambda tup: predict_income(*tup))
# trigger computation
distinct = predictions.distinct().collect()

Batch Scoring with Impala
16
# compile the scoring function
predict_income = udf(signature)(predict_income)
ship_udf(cursor, predict_income, ...)
# perform batch scoring
cursor.execute('SELECT DISTINCT predict_income(age, ... ) ' +
'FROM census_text')
distinct = cursor.fetchall()

Execution Time
17
execution_time =
per_job_overhead +
N * ( per_record_exec + memcmp_exec )

PySpark vs. Impala Performance
18
| Tree size (nodes) | Spark execution time (s) | Impala execution time (s) | Fold difference | Impala compilation time (s) | Bytecode size (bytes) | Percent memcmp nodes |
|---|---|---|---|---|---|---|
| 0 | 160 | 9 | 17x | 0 | 4 | |
| 100 | 175 | 22 | 8x | 1 | 2254 | 22% |
| 500 | 178 | 27 | 7x | 4 | 9803 | 35% |
| 1000 | 184 | 32 | 6x | 16 | 23495 | 34% |
| 1500 | 188 | 35 | 5x | 18 | 28301 | 34% |
| 2000 | 196 | 37 | 5x | 31 | 42442 | 33% |

Execution Time
19
execution_time =
per_job_overhead +
N * ( per_record_exec + memcmp_exec )
Spark: 24 threads / node
[ ]
Impala: 1 thread / node

PySpark vs. Impala Performance
20
| Tree size (nodes) | Spark execution time (s) | Impala execution time (s) | Fold difference | Impala compilation time (s) | Bytecode size (bytes) | Percent memcmp nodes |
|---|---|---|---|---|---|---|
| 0 | 160 | 9 | 17x | 0 | 4 | |
| 100 | 175 | 22 | 8x | 1 | 2254 | 22% |
| 500 | 178 | 27 | 7x | 4 | 9803 | 35% |
| 1000 | 184 | 32 | 6x | 16 | 23495 | 34% |
| 1500 | 188 | 35 | 5x | 18 | 28301 | 34% |
| 2000 | 196 | 37 | 5x | 31 | 42442 | 33% |

Current Status
• Support for all Impala UDF data types (e.g., IntVal,
StringVal, etc.)
• Support for casts to/from primitive types:
• Any operations on primitives should work on Impala types
• Support for NULL types as Python None
• Proof-of-principle support for Python string module
• len
• split
• Concatenation
• Call out to any extern C functions
• Proposed directions
• Array handling
• Numpy support
• What else?
21

UDFs with Impala + Numba
• Simplicity of Python interface/syntax
• Performance of compiled language like C++
• Developed at: https://github.com/cloudera/impyla
• Please try it and tell us what features would be useful
• Please contribute!
22
pip install impyla

Numba-compiled Python UDFs for Impala (Impala Meetup 5/20/14)

Empfohlen

Empfohlen

Weitere ähnliche Inhalte

Was ist angesagt?

Was ist angesagt? (20)

Ähnlich wie Numba-compiled Python UDFs for Impala (Impala Meetup 5/20/14)

Ähnlich wie Numba-compiled Python UDFs for Impala (Impala Meetup 5/20/14) (20)

Mehr von Uri Laserson

Mehr von Uri Laserson (6)

Kürzlich hochgeladen

Kürzlich hochgeladen (20)

Numba-compiled Python UDFs for Impala (Impala Meetup 5/20/14)

Hinweis der Redaktion