RHadoop, Hadoop for R
[Figure: "Scholarly Activity 05-09 change", the 2005-2009 change in scholarly activity for R, SAS, SPSS, S-Plus, and Stata (vertical axis from -37.5% to 50%), shown alongside a log-scale count of R packages (1 to 10000) over 2002-2010. Source: http://r4stats.com/popularity]
David Champagne, CTO
The three RHadoop packages, one per member of the Hadoop family:

rhdfs

rhbase

rmr
sapply(data, function)

mapreduce(data, map = function)
library(rmr)

mapreduce(…)
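
The analogy above is the core design idea: mapreduce is meant to feel like sapply. A minimal sketch of the parallel, assuming the rmr package and a working backend; the tiny data set here is created inline with to.dfs:

library(rmr)

# in memory: apply a function to every element
sapply(1:10, function(x) x^2)

# on Hadoop: the same shape of computation, as a map-only job
input = to.dfs(1:10)
result = mapreduce(input = input,
                   map = function(k, v) keyval(v, v^2))
from.dfs(result)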
                 Expose MR                      Hide MR

  rmr, Rhipe, Dumbo,                  Hive, Pig
  Pydoop, Hadoopy                     Cascalog, Scalding, Scrunch

  Java, C++                           Cascading, Crunch
mapreduce(input, output, map, reduce)
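
Per the editor's notes at the end of this deck, input can be as simple as a path, and output can be skipped, in which case rmr manages a temporary result that can feed the next job. A hedged sketch; "/user/me/logs" and "/user/me/out" are hypothetical HDFS paths:

# explicit output location
mapreduce(input = "/user/me/logs", output = "/user/me/out",
          map = function(k, v) keyval(k, v))

# output omitted: rmr returns a handle to managed storage
tmp = mapreduce(input = "/user/me/logs",
                map = function(k, v) keyval(k, v))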
map = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v)

reduce = function(k, vv) keyval(k, length(vv))
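
The two functions above, a filtering map that keeps roughly one key in ten and a counting reduce, drop straight into a single job. A minimal sketch, assuming input and a hash function with integer output as on the slide (hash is not base R; something like the digest package would have to stand in):

counts = mapreduce(
  input = input,
  map = function(k, v) if (hash(k) %% 10 == 0) keyval(k, v),
  reduce = function(k, vv) keyval(k, length(vv)))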
condition = function(x) x > 10

out = mapreduce(
        input = input,
        map = function(k, v)
                 if (condition(v)) keyval(k, v))
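
Because map closes over condition, the job is "easy to parametrize" (editor's notes): the predicate can change without touching the MapReduce plumbing. A hedged sketch; filter.job is a hypothetical wrapper name:

filter.job = function(input, condition)
  mapreduce(input = input,
            map = function(k, v) if (condition(v)) keyval(k, v))

out.small = filter.job(input, function(x) x > 10)
out.large = filter.job(input, function(x) x > 1000)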
x = from.dfs(hdfs.object)

hdfs.object = to.dfs(x)
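
These two calls are the memory-HDFS bridge, the "second pillar of the API" in the editor's notes. A round-trip sketch, assuming the data fits in memory on the way back:

hdfs.data = to.dfs(rnorm(1000))    # push an ordinary R object to the DFS
local.copy = from.dfs(hdfs.data)   # pull a result back into the R session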
INSERT OVERWRITE TABLE pv_gender_sum
SELECT pv_users.gender, count (DISTINCT pv_users.userid)
FROM pv_users
GROUP BY pv_users.gender;

mapreduce(
  input =
    mapreduce(
      input = "pv_users",
      map = function(k, v) keyval(v['userid'], v['gender']),
      reduce = function(uid, genders)
        lapply(unique(genders), function(g) keyval(NULL, g))),
  output = "pv_gender_sum",
  map = function(x, gender) keyval(gender, 1),
  reduce = function(gender, counts)
    keyval(gender, sum(unlist(counts))))
kmeans =
  function(points, ncenters, iterations = 10,
           distfun = function(a,b) norm(as.matrix(a-b), type = 'F')) {
    newCenters = kmeans.iter(points, distfun, ncenters = ncenters)
    for(i in 1:iterations) {
      newCenters = kmeans.iter(points, distfun, centers = newCenters)}
    newCenters}


kmeans.iter =
  function(points, distfun, ncenters = dim(centers)[1], centers = NULL) {
    from.dfs(
      mapreduce(
        input = points,
        map = if (is.null(centers)) {
                 function(k,v) keyval(sample(1:ncenters,1),v)}
              else {
                 function(k,v) {
                   distances = apply(centers, 1, function(c) distfun(c,v))
                   keyval(centers[which.min(distances),], v)}},
        reduce = function(k,vv) keyval(NULL, apply(do.call(rbind, vv), 2, mean))),
      to.data.frame = T)}
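
A hedged sketch of how the two functions above might be driven; the random input points and their upload via to.dfs are assumptions, not part of the deck (note also that this kmeans masks stats::kmeans):

# 100 random 2-D points keyed by index, pushed to the DFS
points = to.dfs(lapply(1:100, function(i) keyval(i, rnorm(2))))
centers = kmeans(points, ncenters = 3, iterations = 5)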
#!/usr/bin/python
import sys
from math import fabs
from org.apache.pig.scripting import Pig

filename = "student.txt"
k = 4
tolerance = 0.01

MAX_SCORE = 4
MIN_SCORE = 0
MAX_ITERATION = 100

# initial centroid, equally divide the space
initial_centroids = ""
last_centroids = [None] * k
for i in range(k):
    last_centroids[i] = MIN_SCORE + float(i)/k*(MAX_SCORE-MIN_SCORE)
    initial_centroids = initial_centroids + str(last_centroids[i])
    if i!=k-1:
        initial_centroids = initial_centroids + ":"

P = Pig.compile("""register udf.jar
                   DEFINE find_centroid FindCentroid('$centroids');
                   raw = load 'student.txt' as (name:chararray, age:int, gpa:double);
                   centroided = foreach raw generate gpa, find_centroid(gpa) as centroid;
                   grouped = group centroided by centroid;
                   result = foreach grouped generate group, AVG(centroided.gpa);
                   store result into 'output';
                """)

converged = False
iter_num = 0
while iter_num<MAX_ITERATION:
    Q = P.bind({'centroids':initial_centroids})
    results = Q.runSingle()
    if not results.isSuccessful():
        raise Exception("Pig job failed")
    iter = results.result("result").iterator()
    centroids = [None] * k
    distance_move = 0
    # get the new centroids from this iteration, calculate the distance moved since the last iteration
    for i in range(k):
        tuple = iter.next()
        centroids[i] = float(str(tuple.get(1)))
        distance_move = distance_move + fabs(last_centroids[i]-centroids[i])
    distance_move = distance_move / k
    Pig.fs("rmr output")
    print("iteration " + str(iter_num))
    print("average distance moved: " + str(distance_move))
    if distance_move<tolerance:
        sys.stdout.write("k-means converged at centroids: [")
        sys.stdout.write(",".join(str(v) for v in centroids))
        sys.stdout.write("]n")
        converged = True
        break
    last_centroids = centroids[:]
    initial_centroids = ""
    for i in range(k):
        initial_centroids = initial_centroids + str(last_centroids[i])
        if i!=k-1:
            initial_centroids = initial_centroids + ":"
    iter_num += 1

if not converged:
    print("not converge after " + str(iter_num) + " iterations")
    sys.stdout.write("last centroids: [")
    sys.stdout.write(",".join(str(v) for v in last_centroids))
    sys.stdout.write("]n")
import java.io.IOException;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;


public class FindCentroid extends EvalFunc<Double> {
    double[] centroids;
    public FindCentroid(String initialCentroid) {
        String[] centroidStrings = initialCentroid.split(":");
        centroids = new double[centroidStrings.length];
        for (int i=0;i<centroidStrings.length;i++)
            centroids[i] = Double.parseDouble(centroidStrings[i]);
    }
    @Override
    public Double exec(Tuple input) throws IOException {
        double min_distance = Double.MAX_VALUE;
        double closest_centroid = 0;
        for (double centroid : centroids) {
            double distance = Math.abs(centroid - (Double)input.get(0));
            if (distance < min_distance) {
                min_distance = distance;
                closest_centroid = centroid;
            }
        }
        return closest_centroid;
    }

}
mapreduce(mapreduce(…

mapreduce(input = c(input1, input2), …)

equijoin = function(
    left.input, right.input, input,
    output,
    outer,
    map.left, map.right,
    reduce, reduce.all)
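
equijoin assembles a relational join from the same primitives. A hypothetical invocation, assuming the reduce callback receives the key plus the grouped left and right values (the deck shows only the signature above, so the callback shape is a guess):

joined = equijoin(
  left.input = "pv_users",          # hypothetical inputs
  right.input = "pv_clicks",
  reduce = function(k, values.left, values.right)
             keyval(k, list(values.left, values.right)))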
out1 = mapreduce(…)
mapreduce(input = out1, <xyz>)
mapreduce(input = out1, <abc>)

abstract.job = function(input, output, …) {
   …
   result = mapreduce(input = input,
                      output = output)
   …
   result}
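
A filled-in instance of the template above; scale.job and its scaling map are hypothetical names used only for illustration:

scale.job = function(input, output, factor) {
  result = mapreduce(input = input,
                     output = output,
                     map = function(k, v) keyval(k, v * factor))
  result}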
input.format, output.format, format
combine
reduce.on.data.frame
local, hadoop backends
backend.parameters
profiling
verbose
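
Of these, the local backend is particularly useful for development: the same job runs entirely in memory for fast iteration, then moves to Hadoop unchanged. A hedged sketch; the rmr.options() setter shown here follows later rmr releases and may be named differently in this version:

rmr.options(backend = "local")     # develop and debug in memory
out = mapreduce(input = to.dfs(1:10),
                map = function(k, v) keyval(v, v^2))
rmr.options(backend = "hadoop")    # switch to the cluster, code unchanged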
RHADOOP USER
ONE FAT CLUSTER AVE.
HYDROPOWER CITY, OR 0x0000

RHADOOP@REVOLUTIONANALYTICS.COM

Editor's Notes

  1. What is R? What is RHadoop? An open source project, started by Revolution, that aims to make R and Hadoop work together. What is Revolution?
  2. Faster, assured builds; large-data extensions; web deployments; tech support; consulting services; training.
  3. Hadoop brings horizontal scalability, R brings sophisticated analytics; the combination could be powerful.
  4. Hadoop is one project but also a family of projects. We started the integration path with three projects targeting three members of the Hadoop family. rhdfs provides access to the HDFS file system and can be divided into two sub-APIs: file level and byte level.
  5. A way to access big data sets.
  6. A simple way to write parallel programs, which everyone will have to.
  7. Very R-like, building on the functional characteristics of R.
  8. Just a library.
  9. Much simpler than writing Java; not as simple as Hive or Pig at what they do, but more general. Great for prototyping, and can transition to production: optimize instead of rewriting. Lower risk, always executable.
  10. mapreduce is the first and most important element of the API. Input can be as simple as a path; output the same, or skip it for managed space with stubs. map and reduce are simple R functions, as opposed to Rhipe.
  11. Simple map example: filtering. Reduce example: counting.
  12. Easy to parametrize jobs.
  13. The second pillar of the API: the memory-HDFS bridge.
  14. A language like Hive makes a class of problems easy to solve, but it is not a general tool. The cost of doing the same operation in rmr is modest, and it provides a broader set of capabilities.
  15. kmeans implementation in two simple functions; note how easy it is to get data in and out of the cluster.
  16. Skip quickly to other slides; notice three different languages.
  17. More things you can do combining the elements of the API.