SlideShare ist ein Scribd-Unternehmen logo
1 von 25
Downloaden Sie, um offline zu lesen
R      MapReduce

    @holidayworking


    2010   8   28
)
        Twitter: @holidayworking
        :
        :
        :                   F1
        :
        Java, PL/SQL:
        Python, Ruby, R:




@holidayworking ()          R    MapReduce   2010   8   28   2 / 18
MapReduce



   Google

   map          reduce   2




  @holidayworking ()         R   MapReduce   2010   8   28   3 / 18
MapReduce


 1   Map


 2   Shuffle

 3   Reduce




     @holidayworking ()   R   MapReduce   2010   8   28   4 / 18
MapReduce




        [1]


  @holidayworking ()   R   MapReduce   2010   8   28   5 / 18
MapReduce




           Grep




  @holidayworking ()   R   MapReduce   2010   8   28   6 / 18
Hadoop


   Google File System   MapReduce




  @holidayworking ()     R   MapReduce   2010   8   28   7 / 18
Hadoop


   Google File System   MapReduce
   Hadoop      Java




  @holidayworking ()     R   MapReduce   2010   8   28   7 / 18
Hadoop


   Google File System   MapReduce
   Hadoop      Java
          MapReduce                      Java




  @holidayworking ()     R   MapReduce      2010   8   28   7 / 18
Hadoop


   Google File System   MapReduce
   Hadoop      Java
          MapReduce                      Java

   Hadoop Streaming




  @holidayworking ()     R   MapReduce      2010   8   28   7 / 18
Hadoop


   Google File System   MapReduce
   Hadoop      Java
          MapReduce                            Java

   Hadoop Streaming
                                         MapReduce




  @holidayworking ()     R   MapReduce               2010   8   28   7 / 18
Hadoop


   Google File System   MapReduce
   Hadoop      Java
          MapReduce                            Java

   Hadoop Streaming
                                         MapReduce

          R




  @holidayworking ()     R   MapReduce               2010   8   28   7 / 18
R       MapReduce




                    Ardbeg 10 Years Old
                    Bowmore 12 Years Old
                    Talisker 10 Years Old
                    The Glenlivet 12 Year Old
                    The Macallan 12 Years

                    Ballantine 12 Years Old
                    Ballantine 17 Years Old
                    Johnnie Walker Gold Label 18 Years Old
                    Johnnie Walker Swing



    @holidayworking ()               R   MapReduce           2010   8   28   8 / 18
iWork           Numbers
                  250

 2010/07/01          The Macallan 12 Years                    single malt       10
 2010/07/01          Ballantine 12 Years Old                    blended         3
 2010/07/01          Ballantine 17 Years Old                    blended         6
 2010/07/01          Johnnie Walker Gold Label 18 Years Old     blended         6
 2010/07/02          The Glenlivet 12 Year Old                single malt       4
 2010/07/02          Ardbeg 10 Years Old                      single malt       2
 2010/07/02          Ballantine 12 Years Old                    blended         8
 2010/07/02          Ballantine 17 Years Old                    blended         7
 2010/07/02          Johnnie Walker Swing                       blended         3
                                           (   )
 2010/07/31          Johnnie Walker Swing                       blended         4
 2010/07/31          Johnnie Walker Gold Label 18 Years Old     blended         2
 2010/07/31          Bowmore 12 Years Old                     single malt       4
 2010/07/31          Talisker 10 Years Old                    single malt       7



@holidayworking ()                    R   MapReduce                  2010   8   28   9 / 18
@holidayworking ()   R   MapReduce   2010   8   28   10 / 18
MapReduce

 1   Mapper
 2   Reducer
 3   Hadoop Streaming
     $ hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-0.20.2-streaming.jar \
        -input scotch.tsv \
        -output output \
        -mapper mapper.r \
        -reducer reducer.r


 4


     $ cat output/part-00000
     blended 592
     single malt 783




     @holidayworking ()           R   MapReduce             2010   8   28   11 / 18
Reducer
#!/usr/bin/env Rscript

# Hadoop Streaming reducer: sums the integer values seen for each key.
#
# Reads "key<TAB>value" records from stdin, keeps a running total per key in
# an environment, and writes one "key<TAB>total" line per key to stdout.

# Add one "key<TAB>value" record to the running totals.
#
# @param totals Environment mapping each key to its running integer total.
# @param record A single input line.
# @return `totals`, invisibly. Records that lack a non-empty key or an
#   integer value (for example blank lines) are ignored.
add_record <- function(totals, record) {
  # Split on a literal tab; keys such as "single malt" contain the letter
  # "t", so the separator must be the tab escape and not a plain "t".
  fields <- strsplit(record, "\t", fixed = TRUE)[[1]]
  if (length(fields) < 2L || !nzchar(fields[1])) {
    return(invisible(totals))
  }
  key <- fields[1]
  value <- as.integer(fields[2])
  if (is.na(value)) {
    return(invisible(totals))
  }
  if (exists(key, envir = totals, inherits = FALSE)) {
    value <- get(key, envir = totals, inherits = FALSE) + value
  }
  assign(key, value, envir = totals)
  invisible(totals)
}

# Format every accumulated total as a "key<TAB>total<NEWLINE>" string.
#
# @param totals Environment filled by `add_record()`.
# @return Character vector with one element per key, in `ls()` order.
format_totals <- function(totals) {
  format_one <- function(key) {
    sprintf("%s\t%d\n", key, get(key, envir = totals, inherits = FALSE))
  }
  keys <- ls(totals, all.names = TRUE)
  vapply(keys, format_one, character(1), USE.NAMES = FALSE)
}

env <- new.env(hash = TRUE)
con <- file("stdin", open = "r")
while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
  add_record(env, line)
}
close(con)

cat(format_totals(env), sep = "")



     @holidayworking ()           R   MapReduce              2010   8   28   12 / 18
Mapper
#!/usr/bin/env Rscript

# Hadoop Streaming mapper: emits "<date><TAB><order>" for each input record.
#
# Input records are tab-separated lines read from stdin; field 1 is used as
# the date key and field 4 as the order count.

# Convert one input record into its mapper output line.
#
# @param record A single tab-separated input line.
# @return A length-one character vector "<date>\t<order>\n", or
#   `character(0)` when the record has fewer than four fields.
map_line <- function(record) {
  # Split on a literal tab (the escape matters: a bare "t" would split on
  # the letter instead).
  fields <- strsplit(record, "\t", fixed = TRUE)[[1]]
  if (length(fields) < 4L) {
    return(character(0))
  }
  date <- fields[1]
  n_ordered <- fields[4]
  sprintf("%s\t%s\n", date, n_ordered)
}

con <- file("stdin", open = "r")
while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
  cat(map_line(line), sep = "")
}
close(con)



$ cat output/part-00000
2010/07/01 25
2010/07/02 42
2010/07/03 39

2010/07/29 17
2010/07/30 45
2010/07/31 47

@holidayworking ()            R   MapReduce            2010   8   28   13 / 18
Mapper
#!/usr/bin/env Rscript

# Hadoop Streaming mapper: emits "<brand><TAB><order>" for each input record.
#
# Input records are tab-separated lines read from stdin; field 2 is used as
# the brand key and field 4 as the order count.

# Convert one input record into its mapper output line.
#
# @param record A single tab-separated input line.
# @return A length-one character vector "<brand>\t<order>\n", or
#   `character(0)` when the record has fewer than four fields.
map_line <- function(record) {
  # Brand names contain the letter "t" ("Ballantine", "Talisker"), so the
  # separator has to be the tab escape, matched literally.
  fields <- strsplit(record, "\t", fixed = TRUE)[[1]]
  if (length(fields) < 4L) {
    return(character(0))
  }
  brand <- fields[2]
  n_ordered <- fields[4]
  sprintf("%s\t%s\n", brand, n_ordered)
}

con <- file("stdin", open = "r")
while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
  cat(map_line(line), sep = "")
}
close(con)



$ cat output/part-00000
Ardbeg 10 Years Old 166
Ballantine 12 Years Old 142
Ballantine 17 Years Old 150
Bowmore 12 Years Old 149
Johnnie Walker Gold Label 18 Years Old 176
Johnnie Walker Swing 124
Talisker 10 Years Old 176
The Glenlivet 12 Year Old 164
The Macallan 12 Years 128
@holidayworking ()           R    MapReduce            2010   8   28   14 / 18
Mapper
#!/usr/bin/env Rscript

# Hadoop Streaming mapper: emits "<type><TAB><order>" for each input record.
#
# Input records are tab-separated lines read from stdin; field 3 is used as
# the type key and field 4 as the order count.

# Convert one input record into its mapper output line.
#
# @param record A single tab-separated input line.
# @return A length-one character vector "<type>\t<order>\n", or
#   `character(0)` when the record has fewer than four fields.
map_line <- function(record) {
  # "single malt" contains the letter "t"; split on the literal tab escape
  # so the key is not cut short.
  fields <- strsplit(record, "\t", fixed = TRUE)[[1]]
  if (length(fields) < 4L) {
    return(character(0))
  }
  type <- fields[3]
  n_ordered <- fields[4]
  sprintf("%s\t%s\n", type, n_ordered)
}

con <- file("stdin", open = "r")
while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
  cat(map_line(line), sep = "")
}
close(con)




$ cat output/part-00000
blended 592
single malt 783




@holidayworking ()           R   MapReduce             2010   8   28   15 / 18
MapReduce :




@holidayworking ()   R   MapReduce   2010   8   28   16 / 18
MapReduce :

Hadoop : Google File System       MapReduce




@holidayworking ()     R   MapReduce          2010   8   28   16 / 18
MapReduce :

Hadoop : Google File System        MapReduce

Hadoop Streaming               R       MapReduce




@holidayworking ()     R   MapReduce           2010   8   28   16 / 18
@holidayworking ()   R   MapReduce   2010   8   28   17 / 18
Jeffrey Dean and Sanjay Ghemawat.
Mapreduce: Simplified data processing on large clusters.
OSDI’04: Sixth Symposium on Operating System Design and Implementation, 2004.
Tom White.
Hadoop.
                     .




@holidayworking ()              R   MapReduce                2010   8   28      18 / 18

Weitere ähnliche Inhalte

Andere mochten auch

Andere mochten auch (17)

機械の体を手に入れるのよ、 鉄郎!!!
機械の体を手に入れるのよ、鉄郎!!!機械の体を手に入れるのよ、鉄郎!!!
機械の体を手に入れるのよ、 鉄郎!!!
 
A Chainer MeetUp Talk
A Chainer MeetUp TalkA Chainer MeetUp Talk
A Chainer MeetUp Talk
 
Chainer meetup20151014
Chainer meetup20151014Chainer meetup20151014
Chainer meetup20151014
 
Chainer Meetup LT (Alpaca)
Chainer Meetup LT (Alpaca)Chainer Meetup LT (Alpaca)
Chainer Meetup LT (Alpaca)
 
Chainer meetup
Chainer meetupChainer meetup
Chainer meetup
 
LT@Chainer Meetup
LT@Chainer MeetupLT@Chainer Meetup
LT@Chainer Meetup
 
Learning stochastic neural networks with Chainer
Learning stochastic neural networks with ChainerLearning stochastic neural networks with Chainer
Learning stochastic neural networks with Chainer
 
Towards Chainer v1.5
Towards Chainer v1.5Towards Chainer v1.5
Towards Chainer v1.5
 
Introduction to DEEPstation the GUI Deep learning environment for chainer
Introduction to DEEPstation the GUI Deep learning environment for chainerIntroduction to DEEPstation the GUI Deep learning environment for chainer
Introduction to DEEPstation the GUI Deep learning environment for chainer
 
40分でわかるHadoop徹底入門 (Cloudera World Tokyo 2014 講演資料)
40分でわかるHadoop徹底入門 (Cloudera World Tokyo 2014 講演資料) 40分でわかるHadoop徹底入門 (Cloudera World Tokyo 2014 講演資料)
40分でわかるHadoop徹底入門 (Cloudera World Tokyo 2014 講演資料)
 
TensorFlow を使った 機械学習ことはじめ (GDG京都 機械学習勉強会)
TensorFlow を使った機械学習ことはじめ (GDG京都 機械学習勉強会)TensorFlow を使った機械学習ことはじめ (GDG京都 機械学習勉強会)
TensorFlow を使った 機械学習ことはじめ (GDG京都 機械学習勉強会)
 
Rの高速化
Rの高速化Rの高速化
Rの高速化
 
深層学習フレームワーク Chainer の開発と今後の展開
深層学習フレームワーク Chainer の開発と今後の展開深層学習フレームワーク Chainer の開発と今後の展開
深層学習フレームワーク Chainer の開発と今後の展開
 
Chainer Update v1.8.0 -> v1.10.0+
Chainer Update v1.8.0 -> v1.10.0+Chainer Update v1.8.0 -> v1.10.0+
Chainer Update v1.8.0 -> v1.10.0+
 
Chainer v2 alpha
Chainer v2 alphaChainer v2 alpha
Chainer v2 alpha
 
RでGPU使ってみた
RでGPU使ってみたRでGPU使ってみた
RでGPU使ってみた
 
TensorFlowで会話AIを作ってみた。
TensorFlowで会話AIを作ってみた。TensorFlowで会話AIを作ってみた。
TensorFlowで会話AIを作ってみた。
 

Mehr von Hidekazu Tanaka (11)

ggplot2 に入門してみた
ggplot2 に入門してみたggplot2 に入門してみた
ggplot2 に入門してみた
 
データベースのお話
データベースのお話データベースのお話
データベースのお話
 
フォントのお話
フォントのお話フォントのお話
フォントのお話
 
フォントのお話
フォントのお話フォントのお話
フォントのお話
 
バギングで構築された各決定木
バギングで構築された各決定木バギングで構築された各決定木
バギングで構築された各決定木
 
アンサンブル学習
アンサンブル学習アンサンブル学習
アンサンブル学習
 
RHadoop の紹介
RHadoop の紹介RHadoop の紹介
RHadoop の紹介
 
Rの紹介
Rの紹介Rの紹介
Rの紹介
 
Rで解く最適化問題 線型計画問題編
Rで解く最適化問題   線型計画問題編 Rで解く最適化問題   線型計画問題編
Rで解く最適化問題 線型計画問題編
 
R meets Hadoop
R meets HadoopR meets Hadoop
R meets Hadoop
 
Rによるやさしい統計学 第16章 : 因子分析
Rによるやさしい統計学 第16章 : 因子分析Rによるやさしい統計学 第16章 : 因子分析
Rによるやさしい統計学 第16章 : 因子分析
 

RでMapreduce

  • 1. R MapReduce @holidayworking 2010 8 28
  • 2. ) Twitter: @holidayworking : : : F1 : Java, PL/SQL: Python, Ruby, R: @holidayworking () R MapReduce 2010 8 28 2 / 18
  • 3. MapReduce Google map reduce 2 @holidayworking () R MapReduce 2010 8 28 3 / 18
  • 4. MapReduce 1 Map 2 Shuffle 3 Reduce @holidayworking () R MapReduce 2010 8 28 4 / 18
  • 5. MapReduce [1] @holidayworking () R MapReduce 2010 8 28 5 / 18
  • 6. MapReduce Grep @holidayworking () R MapReduce 2010 8 28 6 / 18
  • 7. Hadoop Google File System MapReduce @holidayworking () R MapReduce 2010 8 28 7 / 18
  • 8. Hadoop Google File System MapReduce Hadoop Java @holidayworking () R MapReduce 2010 8 28 7 / 18
  • 9. Hadoop Google File System MapReduce Hadoop Java MapReduce Java @holidayworking () R MapReduce 2010 8 28 7 / 18
  • 10. Hadoop Google File System MapReduce Hadoop Java MapReduce Java Hadoop Streaming @holidayworking () R MapReduce 2010 8 28 7 / 18
  • 11. Hadoop Google File System MapReduce Hadoop Java MapReduce Java Hadoop Streaming MapReduce @holidayworking () R MapReduce 2010 8 28 7 / 18
  • 12. Hadoop Google File System MapReduce Hadoop Java MapReduce Java Hadoop Streaming MapReduce R @holidayworking () R MapReduce 2010 8 28 7 / 18
  • 13. R MapReduce Ardbeg 10 Years Old Bowmore 12 Years Old Talisker 10 Years Old The Glenlivet 12 Year Old The Macallan 12 Years Ballantine 12 Years Old Ballantine 17 Years Old Johnnie Walker Gold Label 18 Years Old Johnnie Walker Swing @holidayworking () R MapReduce 2010 8 28 8 / 18
  • 14. iWork Numbers 250 2010/07/01 The Macallan 12 Years single malt 10 2010/07/01 Ballantine 12 Years Old blended 3 2010/07/01 Ballantine 17 Years Old blended 6 2010/07/01 Johnnie Walker Gold Label 18 Years Old blended 6 2010/07/02 The Glenlivet 12 Year Old single malt 4 2010/07/02 Ardbeg 10 Years Old single malt 2 2010/07/02 Ballantine 12 Years Old blended 8 2010/07/02 Ballantine 17 Years Old blended 7 2010/07/02 Johnnie Walker Swing blended 3 ( ) 2010/07/31 Johnnie Walker Swing blended 4 2010/07/31 Johnnie Walker Gold Label 18 Years Old blended 2 2010/07/31 Bowmore 12 Years Old single malt 4 2010/07/31 Talisker 10 Years Old single malt 7 @holidayworking () R MapReduce 2010 8 28 9 / 18
  • 15. @holidayworking () R MapReduce 2010 8 28 10 / 18
  • 16. MapReduce 1 Mapper 2 Reducer 3 Hadoop Streaming $ hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-0.20.2-streaming.jar -input scotch.tsv -output output -mapper mapper.r -reducer reducer.r 4 $ cat output/part-00000 blended 592 single malt 783 @holidayworking () R MapReduce 2010 8 28 11 / 18
  • 17. Reducer #!/usr/bin/env Rscript env <- new.env(hash = TRUE) con <- file("stdin", open = "r") while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) { line <- unlist(strsplit(line, "\t")) key <- line[1] value <- as.integer(line[2]) if (exists(key, envir = env, inherits = FALSE)) { oldcount <- get(key, envir = env) assign(key, oldcount + value, envir = env) } else { assign(key, value, envir = env) } } close(con) for (key in ls(env, all = TRUE)) { cat(key, "\t", get(key, envir = env), "\n", sep = "") } @holidayworking () R MapReduce 2010 8 28 12 / 18
  • 18. Mapper #!/usr/bin/env Rscript con <- file("stdin", open = "r") while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) { line <- unlist(strsplit(line, "\t")) date <- line[1] order <- line[4] cat(sprintf("%s\t%s\n", date, order), sep = "") } close(con) $ cat output/part-00000 2010/07/01 25 2010/07/02 42 2010/07/03 39 2010/07/29 17 2010/07/30 45 2010/07/31 47 @holidayworking () R MapReduce 2010 8 28 13 / 18
  • 19. Mapper #!/usr/bin/env Rscript con <- file("stdin", open = "r") while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) { line <- unlist(strsplit(line, "\t")) brand <- line[2] order <- line[4] cat(sprintf("%s\t%s\n", brand, order), sep = "") } close(con) $ cat output/part-00000 Ardbeg 10 Years Old 166 Ballantine 12 Years Old 142 Ballantine 17 Years Old 150 Bowmore 12 Years Old 149 Johnnie Walker Gold Label 18 Years Old 176 Johnnie Walker Swing 124 Talisker 10 Years Old 176 The Glenlivet 12 Year Old 164 The Macallan 12 Years 128 @holidayworking () R MapReduce 2010 8 28 14 / 18
  • 20. Mapper #!/usr/bin/env Rscript con <- file("stdin", open = "r") while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) { line <- unlist(strsplit(line, "\t")) type <- line[3] order <- line[4] cat(sprintf("%s\t%s\n", type, order), sep = "") } close(con) $ cat output/part-00000 blended 592 single malt 783 @holidayworking () R MapReduce 2010 8 28 15 / 18
  • 21. MapReduce : @holidayworking () R MapReduce 2010 8 28 16 / 18
  • 22. MapReduce : Hadoop : Google File System MapReduce @holidayworking () R MapReduce 2010 8 28 16 / 18
  • 23. MapReduce : Hadoop : Google File System MapReduce Hadoop Streaming R MapReduce @holidayworking () R MapReduce 2010 8 28 16 / 18
  • 24. @holidayworking () R MapReduce 2010 8 28 17 / 18
  • 25. Jeffrey Dean and Sanjay Ghemawat. Mapreduce: Simplified data processing on large clusters. OSDI’04: Sixth Symposium on Operating System Design and Implementation, 2004. Tom White. Hadoop. . @holidayworking () R MapReduce 2010 8 28 18 / 18