Scalding
Hadoop Word Count
in < 70 lines of code




                  Konrad 'ktoso' Malawski
                 JARCamp #3 12.04.2013
Scalding
Hadoop Word Count

 in   4 lines of code


                   Konrad 'ktoso' Malawski
                  JARCamp #3 12.04.2013
softwaremill.com / java.pl / sckrk.com / geecon.org / krakowscala.pl / gdgkrakow.pl
Agenda
 Why Scalding? (10%)
           +
 Hadoop Basics (20%)
           +
Enter Cascading (40%)
           +
 Hello Scalding (30%)
           =
         100%
Why Scalding?
 Word Count in Types


type Word = String
type Count = Int

String => Map[Word, Count]
Why Scalding?
                Word Count in Scala

val text = "a a a b b"

def wordCount(text: String): Map[Word, Count] =
  text
    .split(" ")
    .map(a => (a, 1))
    .groupBy(_._1)
    .map { a => a._1 -> a._2.map(_._2).sum }



wordCount(text) should equal (Map("a" -> 3, "b" -> 2))
Stuff > Memory
Scala collections... fun, but memory bound!

val text = "so many words... waaah! ..."   // in Memory

  text                                     // in Memory
    .split(" ")                            // in Memory
    .map(a => (a, 1))                      // in Memory
    .groupBy(_._1)                         // in Memory
    .map(a => (a._1, a._2.map(_._2).sum))  // in Memory
Apache Hadoop (HDFS + MR)
    http://hadoop.apache.org/
Why Scalding?
                             Word Count in Hadoop MR

package org.myorg;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

public class WordCount {

    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
            }
        }
    }

    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("wordcount");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}
Trivia: How old is Hadoop?
Cascading
www.cascading.org/
Cascading
     is
Taps & Pipes



        & Sinks
1: Distributed Copy

// source Tap
Tap inTap = new Hfs(new TextDelimited(true, "\t"), inPath);

// sink Tap
Tap outTap = new Hfs(new TextDelimited(true, "\t"), outPath);

// a Pipe, connects taps
Pipe copyPipe = new Pipe("copy");

// build the Flow
FlowDef flowDef = FlowDef.flowDef()
  .addSource(copyPipe, inTap)
  .addTailSink(copyPipe, outTap);

// run!
flowConnector.connect(flowDef).complete();
1. DCP - Full Code
public class Main {
  public static void main(String[] args) {
    String inPath = args[0];
    String outPath = args[1];

    Properties props = new Properties();
    AppProps.setApplicationJarClass(props, Main.class);
    HadoopFlowConnector flowConnector = new HadoopFlowConnector(props);

    Tap inTap = new Hfs(new TextDelimited(true, "\t"), inPath);

    Tap outTap = new Hfs(new TextDelimited(true, "\t"), outPath);

    Pipe copyPipe = new Pipe("copy");

    FlowDef flowDef = FlowDef.flowDef()
      .addSource(copyPipe, inTap)
      .addTailSink(copyPipe, outTap);

    flowConnector.connect(flowDef).complete();
  }
}
2: Word Count

String docPath = args[ 0 ];
String wcPath = args[ 1 ];

Properties props = new Properties();
AppProps.setApplicationJarClass( props, Main.class );
HadoopFlowConnector flowConnector = new HadoopFlowConnector( props );

// create source and sink taps
Tap docTap = new Hfs( new TextDelimited( true, "\t" ), docPath );
Tap wcTap = new Hfs( new TextDelimited( true, "\t" ), wcPath );

// specify a regex operation to split the "document" text lines into a token stream
Fields token = new Fields( "token" );
Fields text = new Fields( "text" );
RegexSplitGenerator splitter =
            new RegexSplitGenerator( token, "[ \\[\\]\\(\\),.]" );

// only returns "token"
Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS );

// determine the word counts
Pipe wcPipe = new Pipe( "wc", docPipe );
wcPipe = new GroupBy( wcPipe, token );
wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL );

// connect the taps, pipes, etc., into a flow
FlowDef flowDef = FlowDef.flowDef()
 .setName( "wc" )
 .addSource( docPipe, docTap )
 .addTailSink( wcPipe, wcTap );

// write a DOT file and run the flow
Flow wcFlow = flowConnector.connect( flowDef );
wcFlow.writeDOT( "dot/wc.dot" );
wcFlow.complete();

2: Word Count
How it's made

Graph representation of jobs!

http://www.cascading.org/2012/07/09/cascading-for-the-impatient-part-2/
How it's made
val flow = FlowDef

// pseudo code...
val jobs: List[MRJob] = flowConnector(flow)

// pseudo code...
HadoopCluster.execute(jobs)
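
A minimal sketch of what that pseudo code corresponds to with the actual Cascading API (assuming the flowConnector and flowDef from the earlier slides; getFlowSteps is the Cascading 2.x accessor for the planned steps):

// plan the FlowDef into a Flow; each FlowStep corresponds roughly to one MR job
val flow = flowConnector.connect(flowDef)
println(s"${flow.getFlowSteps.size} MapReduce steps planned")

// submit the planned steps to the cluster and wait for completion
flow.complete()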
Cascading tips
Pipe assembly = new Pipe( "assembly" );
assembly = new Each( assembly, DebugLevel.VERBOSE, new Debug() );
// ...

// head and tail have same name
FlowDef flowDef = new FlowDef()
  .setName( "debug" )
  .addSource( "assembly", source )
  .addSink( "assembly", sink )
  .addTail( assembly );


flowDef.setDebugLevel( DebugLevel.NONE );

                     flowConnector will NOT create the Debug pipe!
Scalding
     =
     +


   Twitter Scalding
github.com/twitter/scalding
Scalding API
map
 Scala:
  val data = 1 :: 2 :: 3 :: Nil

  val doubled = data map { _ * 2 }

                                             // Int => Int


Scalding:
  IterableSource(data)
    .map('number -> 'doubled) { n: Int => n * 2 }


     stays in Pipe       available in Pipe   // Int => Int
map
 Scala:
  val data = 1 :: 2 :: 3 :: Nil

  val doubled = data map { _ * 2 }

                                          // Int => Int


Scalding:
  IterableSource(data)
    .map('number -> 'doubled) { n: Int => n * 2 }


                      must choose type!   // Int => Int
mapTo
 Scala:
  var data = 1 :: 2 :: 3 :: Nil

  val doubled = data map { _ * 2 }
  data = null
                                                    // Int => Int
            release reference

Scalding:
  IterableSource(data)
    .mapTo('doubled) { n: Int => n * 2 }


  number is removed         doubled stays in Pipe   // Int => Int
flatMap
 Scala:
 val data = "1" :: "2,2" :: "3,3,3" :: Nil   // List[String]

 val numbers = data flatMap { line =>   // String
   line.split(",")                      // Array[String]
 } map { _.toInt }                      // List[Int]

 numbers             // List[Int]
 numbers should equal (List(1, 2, 2, 3, 3, 3))



Scalding:
  TextLine(data)                                 // like List[String]
    .flatMap('line -> 'word) { _.split(",") }    // like List[String]
    .map('word -> 'number) { _.toInt }           // like List[Int]

                     MR map outside
flatMap
 Scala:
 val data = "1" :: "2,2" :: "3,3,3" :: Nil   // List[String]

 val numbers = data flatMap { line =>   // String
   line.split(",").map(_.toInt)         // Array[Int]
 }

 numbers             // List[Int]
 numbers should equal (List(1, 2, 2, 3, 3, 3))



Scalding:
  TextLine(data)                               // like List[String]
    .flatMap('line -> 'word) { _.split(",").map(_.toInt) }
                                                  // like List[Int]
                          map inside Scala
groupBy
 Scala:
 val data = 1 :: 2 :: 30 :: 42 :: Nil         // List[Int]

 val groups = data groupBy { _ < 10 }

 groups                // Map[Boolean, List[Int]]

 groups(true) should equal (List(1, 2))
 groups(false) should equal (List(30, 42))



Scalding:
 IterableSource(List(1, 2, 30, 42), 'num)
     .map('num -> 'lessThanTen) { i: Int => i < 10 }
     .groupBy('lessThanTen) { _.size('size) }

 groups all with == value                     => 'size
groupBy


Scalding:
 IterableSource(List(1, 2, 30, 42), 'num)
     .map('num -> 'lessThanTen) { i: Int => i < 10 }
     .groupBy('lessThanTen) { _.sum('total) }

                              'total = [3, 72]
Scalding API
             project / discard
               map / mapTo
            flatMap / flatMapTo
                 rename
                   filter
                  unique
groupBy / groupAll / groupRandom / shuffle
                   limit
                  debug

          Group operations
Scalding API
             project / discard
               map / mapTo
            flatMap / flatMapTo
                 rename
                   filter
                  unique
groupBy / groupAll / groupRandom / shuffle
                   limit
                  debug

          Group operations

                 joins
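
A rough sketch (hypothetical field and file names, written inside a Job as on the other slides) of how a few of these operations compose in the fields-based API:

Tsv(args("input"), ('user, 'age, 'city))
  .filter('age) { a: Int => a >= 18 }          // keep adults only
  .rename('city -> 'location)
  .project(('user, 'location))                 // drop all other fields
  .unique(('user, 'location))
  .groupBy('location) { _.size('users) }       // users per location
  .write(Tsv(args("output")))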
Distributed Copy in Scalding

class WordCountJob(args: Args) extends Job(args) {

    val input = Tsv(args("input"))
    val output = Tsv(args("output"))

    input.read.write(output)

}




                      The End.
Main Class - "Runner"

import org.apache.hadoop.util.ToolRunner
import com.twitter.scalding

object ScaldingJobRunner extends App {

    ToolRunner.run(new Configuration, new scalding.Tool, args)

}
Main Class - "Runner"

import org.apache.hadoop.util.ToolRunner
import com.twitter.scalding

object ScaldingJobRunner extends App {          from App

    ToolRunner.run(new Configuration, new scalding.Tool, args)

}
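
For example (hypothetical paths; --local and --hdfs are the standard scalding.Tool mode flags), the job shown on the next slides could be started from sbt through this runner with:

run pl.project13.scala.oculus.job.WordCountJob --local --input data/hamlet.txt --output target/word-count.tsv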
Word Count in Scalding
class WordCountJob(args: Args) extends Job(args) {




}
Word Count in Scalding
class WordCountJob(args: Args) extends Job(args) {

    val inputFile = args("input")
    val outputFile = args("output")




}
Word Count in Scalding
class WordCountJob(args: Args) extends Job(args) {

    val inputFile = args("input")
    val outputFile = args("output")

    TextLine(inputFile)




}
Word Count in Scalding
class WordCountJob(args: Args) extends Job(args) {

    val inputFile = args("input")
    val outputFile = args("output")

    TextLine(inputFile)
      .flatMap('line -> 'word) { line: String => tokenize(line) }



    def tokenize(text: String): Array[String] = implemented
}
Word Count in Scalding
class WordCountJob(args: Args) extends Job(args) {

    val inputFile = args("input")
    val outputFile = args("output")

    TextLine(inputFile)
      .flatMap('line -> 'word) { line: String => tokenize(line) }
      .groupBy('word) { group => group.size('count) }


    def tokenize(text: String): Array[String] = implemented
}
Word Count in Scalding
class WordCountJob(args: Args) extends Job(args) {

    val inputFile = args("input")
    val outputFile = args("output")

    TextLine(inputFile)
      .flatMap('line -> 'word) { line: String => tokenize(line) }
      .groupBy('word) { group => group.size }


    def tokenize(text: String): Array[String] = implemented
}
Word Count in Scalding
class WordCountJob(args: Args) extends Job(args) {

    val inputFile = args("input")
    val outputFile = args("output")

    TextLine(inputFile)
      .flatMap('line -> 'word) { line: String => tokenize(line) }
      .groupBy('word) { _.size }


    def tokenize(text: String): Array[String] = implemented
}
Word Count in Scalding
class WordCountJob(args: Args) extends Job(args) {

    val inputFile = args("input")
    val outputFile = args("output")

    TextLine(inputFile)
      .flatMap('line -> 'word) { line: String => tokenize(line) }
      .groupBy('word) { _.size }
      .write(Tsv(outputFile))

    def tokenize(text: String): Array[String] = implemented
}
Word Count in Scalding
class WordCountJob(args: Args) extends Job(args) {

    val inputFile = args("input")
    val outputFile = args("output")

    // the whole pipeline: the 4 lines below
    TextLine(inputFile)
      .flatMap('line -> 'word) { line: String => tokenize(line) }
      .groupBy('word) { _.size }
      .write(Tsv(outputFile))

    def tokenize(text: String): Array[String] = implemented
}
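
The slides leave tokenize as "implemented"; a minimal sketch of one possible implementation (lower-casing, stripping punctuation, splitting on whitespace):

  def tokenize(text: String): Array[String] =
    text.toLowerCase.replaceAll("[^a-z0-9\\s]", "").split("\\s+")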
Word Count in Scalding
run pl.project13.scala.oculus.job.WordCountJob --tool.graph

=> pl.project13.scala.oculus.job.WordCountJob0.dot

(the generated flow graph: a MAP phase feeding a REDUCE phase)
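
The .dot file can be rendered with Graphviz, e.g. dot -Tpng pl.project13.scala.oculus.job.WordCountJob0.dot -o wordcount-flow.png (assuming Graphviz is installed), to inspect the map and reduce phases the planner produced.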
Word Count in Scalding
TextLine(inputFile)
  .flatMap('line -> 'word) { line: String => tokenize(line) }
  .groupBy('word) { _.size('count) }
  .write(Tsv(outputFile))
Why Scalding?


    Hadoop inside
Cascading abstractions
  Scala conciseness
Ask Stuff!

      Dzięki!
      Thanks!
     ありがとう!


Konrad Malawski @ java.pl
t: ktosopl / g: ktoso
b: blog.project13.pl

Akka Typed (quick talk) - JFokus 2018
 
ScalaSwarm 2017 Keynote: Tough this be madness yet theres method in't
ScalaSwarm 2017 Keynote: Tough this be madness yet theres method in'tScalaSwarm 2017 Keynote: Tough this be madness yet theres method in't
ScalaSwarm 2017 Keynote: Tough this be madness yet theres method in't
 
State of Akka 2017 - The best is yet to come
State of Akka 2017 - The best is yet to comeState of Akka 2017 - The best is yet to come
State of Akka 2017 - The best is yet to come
 
Building a Reactive System with Akka - Workshop @ O'Reilly SAConf NYC
Building a Reactive System with Akka - Workshop @ O'Reilly SAConf NYCBuilding a Reactive System with Akka - Workshop @ O'Reilly SAConf NYC
Building a Reactive System with Akka - Workshop @ O'Reilly SAConf NYC
 
Akka-chan's Survival Guide for the Streaming World
Akka-chan's Survival Guide for the Streaming WorldAkka-chan's Survival Guide for the Streaming World
Akka-chan's Survival Guide for the Streaming World
 
Reactive integrations with Akka Streams
Reactive integrations with Akka StreamsReactive integrations with Akka Streams
Reactive integrations with Akka Streams
 
Not Only Streams for Akademia JLabs
Not Only Streams for Akademia JLabsNot Only Streams for Akademia JLabs
Not Only Streams for Akademia JLabs
 
Reactive Streams, j.u.concurrent & Beyond!
Reactive Streams, j.u.concurrent & Beyond!Reactive Streams, j.u.concurrent & Beyond!
Reactive Streams, j.u.concurrent & Beyond!
 
End to End Akka Streams / Reactive Streams - from Business to Socket
End to End Akka Streams / Reactive Streams - from Business to SocketEnd to End Akka Streams / Reactive Streams - from Business to Socket
End to End Akka Streams / Reactive Streams - from Business to Socket
 
The Cloud-natives are RESTless @ JavaOne
The Cloud-natives are RESTless @ JavaOneThe Cloud-natives are RESTless @ JavaOne
The Cloud-natives are RESTless @ JavaOne
 
Akka Streams in Action @ ScalaDays Berlin 2016
Akka Streams in Action @ ScalaDays Berlin 2016Akka Streams in Action @ ScalaDays Berlin 2016
Akka Streams in Action @ ScalaDays Berlin 2016
 
Krakow communities @ 2016
Krakow communities @ 2016Krakow communities @ 2016
Krakow communities @ 2016
 
The things we don't see – stories of Software, Scala and Akka
The things we don't see – stories of Software, Scala and AkkaThe things we don't see – stories of Software, Scala and Akka
The things we don't see – stories of Software, Scala and Akka
 
[Japanese] How Reactive Streams and Akka Streams change the JVM Ecosystem @ R...
[Japanese] How Reactive Streams and Akka Streams change the JVM Ecosystem @ R...[Japanese] How Reactive Streams and Akka Streams change the JVM Ecosystem @ R...
[Japanese] How Reactive Streams and Akka Streams change the JVM Ecosystem @ R...
 
Zen of Akka
Zen of AkkaZen of Akka
Zen of Akka
 
How Reactive Streams & Akka Streams change the JVM Ecosystem
How Reactive Streams & Akka Streams change the JVM EcosystemHow Reactive Streams & Akka Streams change the JVM Ecosystem
How Reactive Streams & Akka Streams change the JVM Ecosystem
 
The Need for Async @ ScalaWorld
The Need for Async @ ScalaWorldThe Need for Async @ ScalaWorld
The Need for Async @ ScalaWorld
 
Reactive Stream Processing with Akka Streams
Reactive Stream Processing with Akka StreamsReactive Stream Processing with Akka Streams
Reactive Stream Processing with Akka Streams
 
Reactive Streams / Akka Streams - GeeCON Prague 2014
Reactive Streams / Akka Streams - GeeCON Prague 2014Reactive Streams / Akka Streams - GeeCON Prague 2014
Reactive Streams / Akka Streams - GeeCON Prague 2014
 

Recently uploaded

Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and Myths
Joaquim Jorge
 

Recently uploaded (20)

Apidays New York 2024 - The Good, the Bad and the Governed by David O'Neill, ...
Apidays New York 2024 - The Good, the Bad and the Governed by David O'Neill, ...Apidays New York 2024 - The Good, the Bad and the Governed by David O'Neill, ...
Apidays New York 2024 - The Good, the Bad and the Governed by David O'Neill, ...
 
Automating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps ScriptAutomating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps Script
 
Boost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdfBoost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdf
 
Understanding Discord NSFW Servers A Guide for Responsible Users.pdf
Understanding Discord NSFW Servers A Guide for Responsible Users.pdfUnderstanding Discord NSFW Servers A Guide for Responsible Users.pdf
Understanding Discord NSFW Servers A Guide for Responsible Users.pdf
 
Scaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationScaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organization
 
Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024
 
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot TakeoffStrategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
 
Artificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and MythsArtificial Intelligence: Facts and Myths
Artificial Intelligence: Facts and Myths
 
Deploy with confidence: VMware Cloud Foundation 5.1 on next gen Dell PowerEdg...
Deploy with confidence: VMware Cloud Foundation 5.1 on next gen Dell PowerEdg...Deploy with confidence: VMware Cloud Foundation 5.1 on next gen Dell PowerEdg...
Deploy with confidence: VMware Cloud Foundation 5.1 on next gen Dell PowerEdg...
 
HTML Injection Attacks: Impact and Mitigation Strategies
HTML Injection Attacks: Impact and Mitigation StrategiesHTML Injection Attacks: Impact and Mitigation Strategies
HTML Injection Attacks: Impact and Mitigation Strategies
 
Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024
 
A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)
 
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemkeProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
ProductAnonymous-April2024-WinProductDiscovery-MelissaKlemke
 
GenAI Risks & Security Meetup 01052024.pdf
GenAI Risks & Security Meetup 01052024.pdfGenAI Risks & Security Meetup 01052024.pdf
GenAI Risks & Security Meetup 01052024.pdf
 
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost SavingRepurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
Repurposing LNG terminals for Hydrogen Ammonia: Feasibility and Cost Saving
 
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
 
Top 10 Most Downloaded Games on Play Store in 2024
Top 10 Most Downloaded Games on Play Store in 2024Top 10 Most Downloaded Games on Play Store in 2024
Top 10 Most Downloaded Games on Play Store in 2024
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected Worker
 
Real Time Object Detection Using Open CV
Real Time Object Detection Using Open CVReal Time Object Detection Using Open CV
Real Time Object Detection Using Open CV
 
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
 

Scalding - Hadoop Word Count in LESS than 70 lines of code

  • 1. Scalding Hadoop Word Count in < 70 lines of code Konrad 'ktoso' Malawski JARCamp #3 12.04.2013
  • 2. Scalding Hadoop Word Count in 4 lines of code Konrad 'ktoso' Malawski JARCamp #3 12.04.2013
  • 3. softwaremill.com / java.pl / sckrk.com / geecon.org / krakowscala.pl / gdgkrakow.pl
  • 7. Agenda Why Scalding? (10%) + Hadoop Basics (20%)
  • 8. Agenda Why Scalding? (10%) + Hadoop Basics (20%) +
  • 9. Agenda Why Scalding? (10%) + Hadoop Basics (20%) + Enter Cascading (40%)
  • 10. Agenda Why Scalding? (10%) + Hadoop Basics (20%) + Enter Cascading (40%) +
  • 11. Agenda Why Scalding? (10%) + Hadoop Basics (20%) + Enter Cascading (40%) + Hello Scalding (30%)
  • 12. Agenda Why Scalding? (10%) + Hadoop Basics (20%) + Enter Cascading (40%) + Hello Scalding (30%) =
  • 13. Agenda Why Scalding? (10%) + Hadoop Basics (20%) + Enter Cascading (40%) + Hello Scalding (30%) = 100%
  • 14. Why Scalding? Word Count in Types type Word = String type Count = Int String => Map[Word, Count]
  • 15. Why Scalding? Word Count in Scala
  • 16. Why Scalding? Word Count in Scala val text = "a a a b b"
  • 17. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] =
  • 18. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text
  • 19. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text .split(" ")
  • 20. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text .split(" ") .map(a => (a, 1))
  • 21. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text .split(" ") .map(a => (a, 1)) .groupBy(_._1)
  • 22. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map { a => a._1 -> a._2.map(_._2).sum }
  • 23. Why Scalding? Word Count in Scala val text = "a a a b b" def wordCount(text: String): Map[Word, Count] = text .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map { a => a._1 -> a._2.map(_._2).sum } wordCount(text) should equal (Map("a" -> 3, "b" -> 2))
  • 24. Stuff > Memory Scala collections... fun but, memory bound! val text = "so many words... waaah! ..." text .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum))
  • 25. Stuff > Memory Scala collections... fun but, memory bound! in Memory val text = "so many words... waaah! ..." text .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum))
  • 26. Stuff > Memory Scala collections... fun but, memory bound! in Memory val text = "so many words... waaah! ..." in Memory text .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum))
  • 27. Stuff > Memory Scala collections... fun but, memory bound! in Memory val text = "so many words... waaah! ..." in Memory text in Memory .split(" ") .map(a => (a, 1)) .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum))
  • 28. Stuff > Memory Scala collections... fun but, memory bound! in Memory val text = "so many words... waaah! ..." in Memory text in Memory .split(" ") .map(a => (a, 1)) in Memory .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum))
  • 29. Stuff > Memory Scala collections... fun but, memory bound! in Memory val text = "so many words... waaah! ..." in Memory text in Memory .split(" ") .map(a => (a, 1)) in Memory .groupBy(_._1) .map(a => (a._1, a._2.map(_._2).sum)) in Memory
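For reference, the in-memory word count built up on the slides above fits into one small, runnable object. This is a sketch: the ScalaTest-style assertion from the slide is replaced with a plain assert so it compiles on its own, and it is exactly the heap-bound version that the Hadoop material below is meant to replace.

object InMemoryWordCount extends App {
  type Word  = String
  type Count = Int

  // every intermediate collection lives on the heap -- fine for a few words,
  // not for a few terabytes
  def wordCount(text: String): Map[Word, Count] =
    text
      .split(" ")
      .map(word => (word, 1))
      .groupBy(_._1)
      .map { case (word, pairs) => word -> pairs.map(_._2).sum }

  assert(wordCount("a a a b b") == Map("a" -> 3, "b" -> 2))
}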
  • 30. Apache Hadoop (HDFS + MR) http://hadoop.apache.org/
  • 31. Why Scalding? Word Count in Hadoop MR package org.myorg; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.*; import java.io.IOException; import java.util.Iterator; import java.util.StringTokenizer; public class WordCount { public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> { private final static IntWritable one = new IntWritable(1); private Text word = new Text(); public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) thro IOException { String line = value.toString(); StringTokenizer tokenizer = new StringTokenizer(line); while (tokenizer.hasMoreTokens()) { word.set(tokenizer.nextToken()); output.collect(word, one);
  • 32. private final static IntWritable one = new IntWritable(1); Why Scalding? private Text word = new Text(); public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) thro IOException { String line = value.toString(); StringTokenizer tokenizer = new StringTokenizer(line); while (tokenizer.hasMoreTokens()) { word.set(tokenizer.nextToken()); Word Count in Hadoop MR output.collect(word, one); } } } public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> { public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException { int sum = 0; while (values.hasNext()) { sum += values.next().get(); } output.collect(key, new IntWritable(sum)); } } public static void main(String[] args) throws Exception { JobConf conf = new JobConf(WordCount.class); conf.setJobName("wordcount"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(Map.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf); } }
  • 33. Trivia: How old is Hadoop?
  • 44. Cascading is
  • 45. Cascading is Taps & Pipes
  • 46. Cascading is Taps & Pipes & Sinks
  • 49. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath);
  • 50. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath);
  • 51. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); // a Pipe, connects taps Pipe copyPipe = new Pipe("copy");
  • 52. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); // a Pipe, connects taps Pipe copyPipe = new Pipe("copy"); // build the Flow FlowDef flowDef = FlowDef.flowDef()
  • 53. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); // a Pipe, connects taps Pipe copyPipe = new Pipe("copy"); // build the Flow FlowDef flowDef = FlowDef.flowDef() .addSource( copyPipe, inTap )
  • 54. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); // a Pipe, connects taps Pipe copyPipe = new Pipe("copy"); // build the Flow FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap);
  • 55. 1: Distributed Copy // source Tap Tap inTap = new Hfs(new TextDelimited(true, "t"), inPath); // sink Tap Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); // a Pipe, connects taps Pipe copyPipe = new Pipe("copy"); // build the Flow FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); // run! flowConnector.connect(flowDef).complete();
  • 56. 1. DCP - Full Code public class Main { public static void main(String[] args ) { String inPath = args[0]; String outPath = args[1]; Properties props = new Properties(); AppProps.setApplicationJarClass(properties, Main.class); HadoopFlowConnector flowConnector = new HadoopFlowConnector(props); Tap inTap = new Hfs( new TextDelimited(true, "t"), inPath); Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); Pipe copyPipe = new Pipe("copy"); FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); flowConnector.connect(flowDef).complete(); } }
  • 57. 1. DCP - Full Code public class Main { public static void main(String[] args ) { String inPath = args[0]; String outPath = args[1]; Properties props = new Properties(); AppProps.setApplicationJarClass(properties, Main.class); HadoopFlowConnector flowConnector = new HadoopFlowConnector(props); Tap inTap = new Hfs( new TextDelimited(true, "t"), inPath); Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); Pipe copyPipe = new Pipe("copy"); FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); flowConnector.connect(flowDef).complete(); } }
  • 58. 1. DCP - Full Code public class Main { public static void main(String[] args ) { String inPath = args[0]; String outPath = args[1]; Properties props = new Properties(); AppProps.setApplicationJarClass(properties, Main.class); HadoopFlowConnector flowConnector = new HadoopFlowConnector(props); Tap inTap = new Hfs( new TextDelimited(true, "t"), inPath); Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); Pipe copyPipe = new Pipe("copy"); FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); flowConnector.connect(flowDef).complete(); } }
  • 59. 1. DCP - Full Code public class Main { public static void main(String[] args ) { String inPath = args[0]; String outPath = args[1]; Properties props = new Properties(); AppProps.setApplicationJarClass(properties, Main.class); HadoopFlowConnector flowConnector = new HadoopFlowConnector(props); Tap inTap = new Hfs( new TextDelimited(true, "t"), inPath); Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); Pipe copyPipe = new Pipe("copy"); FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); flowConnector.connect(flowDef).complete(); } }
  • 60. 1. DCP - Full Code public class Main { public static void main(String[] args ) { String inPath = args[0]; String outPath = args[1]; Properties props = new Properties(); AppProps.setApplicationJarClass(properties, Main.class); HadoopFlowConnector flowConnector = new HadoopFlowConnector(props); Tap inTap = new Hfs( new TextDelimited(true, "t"), inPath); Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); Pipe copyPipe = new Pipe("copy"); FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); flowConnector.connect(flowDef).complete(); } }
  • 61. 1. DCP - Full Code public class Main { public static void main(String[] args ) { String inPath = args[0]; String outPath = args[1]; Properties props = new Properties(); AppProps.setApplicationJarClass(properties, Main.class); HadoopFlowConnector flowConnector = new HadoopFlowConnector(props); Tap inTap = new Hfs( new TextDelimited(true, "t"), inPath); Tap outTap = new Hfs(new TextDelimited(true, "t"), outPath); Pipe copyPipe = new Pipe("copy"); FlowDef flowDef = FlowDef.flowDef() .addSource(copyPipe, inTap) .addTailSink(copyPipe, outTap); flowConnector.connect(flowDef).complete(); } }
  • 62. 2: Word Count String docPath = args[ 0 ]; String wcPath = args[ 1 ]; Properties properties = new Properties(); AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath );
  • 63. 2: Word Count String docPath = args[ 0 ]; String wcPath = args[ 1 ]; Properties properties = new Properties(); AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); // specify a regex operation to split the "document" text lines into a token stream
  • 64. 2: Word Count String docPath = args[ 0 ]; String wcPath = args[ 1 ]; Properties properties = new Properties(); AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); // specify a regex operation to split the "document" text lines into a token stream Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [ ),.]" );
  • 65. 2: Word Count String docPath = args[ 0 ]; String wcPath = args[ 1 ]; Properties properties = new Properties(); AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL );
  • 66. String wcPath = args[ 1 ]; 2: Word Count 2: Word Count Properties properties = new Properties(); AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" )
  • 67. AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); 2: Word Count // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow
  • 68. AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); 2: Word Count // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow
  • 69. AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); 2: Word Count // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow
  • 70. AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); 2: Word Count // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow
  • 71. AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); 2: Word Count // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow
  • 72. AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); 2: Word Count // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow
  • 73. AppProps.setApplicationJarClass( props, Main.class ); HadoopFlowConnector flowConnector = new HadoopFlowConnector( props ); 2: Word Count // create source and sink taps Tap docTap = new Hfs( new TextDelimited( true, "t" ), docPath ); Tap wcTap = new Hfs( new TextDelimited( true, "t" ), wcPath ); Fields token = new Fields( "token" ); Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow
  • 74. Fields token = new Fields( "token" ); 2: Word Count Fields text = new Fields( "text" ); RegexSplitGenerator splitter = new RegexSplitGenerator( token, "[ [](),.]" ); // only returns "token" Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow Flow wcFlow = flowConnector.connect( flowDef ); wcFlow.writeDOT( "dot/wc.dot" ); wcFlow.complete(); } }
  • 75. Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); 2: Word Count How it's made // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow Flow wcFlow = flowConnector.connect( flowDef ); wcFlow.writeDOT( "dot/wc.dot" ); wcFlow.complete(); } }
  • 76. Pipe docPipe = new Each( "token", text, splitter, Fields.RESULTS ); 2: Word Count How it's made // determine the word counts Pipe wcPipe = new Pipe( "wc", docPipe ); wcPipe = new GroupBy( wcPipe, token ); wcPipe = new Every( wcPipe, Fields.ALL, new Count(), Fields.ALL ); // connect the taps, pipes, etc., into a flow FlowDef flowDef = FlowDef.flowDef() .setName( "wc" ) .addSource( docPipe, docTap ) .addTailSink( wcPipe, wcTap ); // write a DOT file and run the flow Flow wcFlow = flowConnector.connect( flowDef ); wcFlow.writeDOT( "dot/wc.dot" ); wcFlow.complete(); } Graph representation of jobs! }
  • 77. 2: Word Count How it's made http://www.cascading.org/2012/07/09/cascading-for-the-impatient-part-2/
  • 79. How it's made val flow = FlowDef
  • 80. How it's made val flow = FlowDef // pseudo code...
  • 81. How it's made val flow = FlowDef // pseudo code... val jobs: List[MRJob] = flowConnector(flow)
  • 82. How it's made val flow = FlowDef // pseudo code... val jobs: List[MRJob] = flowConnector(flow) // pseudo code...
  • 83. How it's made val flow = FlowDef // pseudo code... val jobs: List[MRJob] = flowConnector(flow) // pseudo code... HadoopCluster.execute(jobs)
  • 84. How it's made val flow = FlowDef // pseudo code... val jobs: List[MRJob] = flowConnector(flow) // pseudo code... HadoopCluster.execute(jobs)
  • 85. Cascading tips Pipe assembly = new Pipe( "assembly" ); assembly = new Each( assembly, DebugLevel.VERBOSE, new Debug() ); // ... // head and tail have same name FlowDef flowDef = new FlowDef() .setName( "debug" ) .addSource( "assembly", source ) .addSink( "assembly", sink ) .addTail( assembly );
  • 86. Cascading tips Pipe assembly = new Pipe( "assembly" ); assembly = new Each( assembly, DebugLevel.VERBOSE, new Debug() ); // ... // head and tail have same name FlowDef flowDef = new FlowDef() .setName( "debug" ) .addSource( "assembly", source ) .addSink( "assembly", sink ) .addTail( assembly ); flowDef.setDebugLevel( DebugLevel.NONE );
  • 87. Cascading tips Pipe assembly = new Pipe( "assembly" ); assembly = new Each( assembly, DebugLevel.VERBOSE, new Debug() ); // ... // head and tail have same name FlowDef flowDef = new FlowDef() .setName( "debug" ) .addSource( "assembly", source ) .addSink( "assembly", sink ) .addTail( assembly ); flowDef.setDebugLevel( DebugLevel.NONE ); flowConnector will NOT create the Debug pipe!
  • 88. Scalding = Scala + Cascading Twitter Scalding github.com/twitter/scalding
  • 90. map
  • 91. map Scala: val data = 1 :: 2 :: 3 :: Nil
  • 92. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 }
  • 93. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int
  • 94. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int
  • 95. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data)
  • 96. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data) .map('number -> 'doubled) { n: Int => n * 2 }
  • 97. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data) .map('number -> 'doubled) { n: Int => n * 2 } // Int => Int
  • 98. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data) .map('number -> 'doubled) { n: Int => n * 2 } available in Pipe // Int => Int
  • 99. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data) .map('number -> 'doubled) { n: Int => n * 2 } stays in Pipe available in Pipe // Int => Int
  • 100. map Scala: val data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } // Int => Int Scalding: IterableSource(data) .map('number -> 'doubled) { n: Int => n * 2 } must choose type! // Int => Int
  • 101. mapTo
  • 102. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil
  • 103. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 }
  • 104. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null
  • 105. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int
  • 106. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference
  • 107. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference
  • 108. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference Scalding: IterableSource(data)
  • 109. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference Scalding: IterableSource(data) .mapTo('doubled) { n: Int => n * 2 }
  • 110. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference Scalding: IterableSource(data) .mapTo('doubled) { n: Int => n * 2 } // Int => Int
  • 111. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference Scalding: IterableSource(data) .mapTo('doubled) { n: Int => n * 2 } doubled stays in Pipe // Int => Int
  • 112. mapTo Scala: var data = 1 :: 2 :: 3 :: Nil val doubled = data map { _ * 2 } data = null // Int => Int release reference Scalding: IterableSource(data) .mapTo('doubled) { n: Int => n * 2 } number is removed doubled stays in Pipe // Int => Int
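The difference between map and mapTo is easiest to see side by side. A minimal sketch using the fields-based API from the slides above (the Job wrapper, argument keys and Tsv outputs are hypothetical, added only so the snippet compiles as a whole):

import com.twitter.scalding._

class MapVsMapToJob(args: Args) extends Job(args) {
  // map appends the new field: the stream now carries ('number, 'doubled)
  IterableSource(List(1, 2, 3), 'number)
    .map('number -> 'doubled) { n: Int => n * 2 }
    .write(Tsv(args("map-output")))

  // mapTo keeps only the result field: the stream carries just ('doubled)
  IterableSource(List(1, 2, 3), 'number)
    .mapTo('number -> 'doubled) { n: Int => n * 2 }
    .write(Tsv(args("mapTo-output")))
}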
  • 114. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String]
  • 115. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String
  • 116. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String]
  • 117. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int]
  • 118. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int]
  • 119. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3))
  • 120. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3))
  • 121. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String]
  • 122. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { _.split(",") } // like List[String]
  • 123. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { _.split(",") } // like List[String] .map('word -> 'number) { _.toInt } // like List[Int]
  • 124. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",") // Array[String] } map { _.toInt } // List[Int] numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { _.split(",") } // like List[String] .map('word -> 'number) { _.toInt } // like List[Int] MR map outside
  • 126. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String]
  • 127. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String
  • 128. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int]
  • 129. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] }
  • 130. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int]
  • 131. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3))
  • 132. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3))
  • 133. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String]
  • 134. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { _.split(",").map(_.toInt) }
  • 135. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { _.split(",").map(_.toInt) } // like List[Int]
  • 136. flatMap Scala: val data = "1" :: "2,2" :: "3,3,3" :: Nil // List[String] val numbers = data flatMap { line => // String line.split(",").map(_.toInt) // Array[Int] } numbers // List[Int] numbers should equal (List(1, 2, 2, 3, 3, 3)) Scalding: TextLine(data) // like List[String] .flatMap('line -> 'word) { _.split(",").map(_.toInt) } // like List[Int] map inside Scala
  • 138. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int]
  • 139. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 }
  • 140. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]]
  • 141. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2))
  • 142. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42))
  • 143. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42))
  • 144. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42)) Scalding: IterableSource(List(1, 2, 30, 42), 'num)
  • 145. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42)) Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 }
  • 146. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42)) Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 } .groupBy('lessThanTen) { _.size('size) }
  • 147. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42)) Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 } .groupBy('lessThanTen) { _.size('size) } groups all with == value
  • 148. groupBy Scala: val data = 1 :: 2 :: 30 :: 42 :: Nil // List[Int] val groups = data groupBy { _ < 10 } groups // Map[Boolean, List[Int]] groups(true) should equal (List(1, 2)) groups(false) should equal (List(30, 42)) Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 } .groupBy('lessThanTen) { _.size('size) } groups all with == value => 'size
  • 151. groupBy Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 }
  • 152. groupBy Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 } .groupBy('lessThanTen) { _.sum('total) }
  • 153. groupBy Scalding: IterableSource(List(1, 2, 30, 42), 'num) .map('num -> 'lessThanTen) { i: Int => i < 10 } .groupBy('lessThanTen) { _.sum('total) } 'total = [3, 74]
  • 155. Scalding API project / discard
  • 156. Scalding API project / discard map / mapTo
  • 157. Scalding API project / discard map / mapTo flatMap / flatMapTo
  • 158. Scalding API project / discard map / mapTo flatMap / flatMapTo rename
  • 159. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter
  • 160. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique
  • 161. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique groupBy / groupAll / groupRandom / shuffle
  • 162. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique groupBy / groupAll / groupRandom / shuffle limit
  • 163. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique groupBy / groupAll / groupRandom / shuffle limit debug
  • 164. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique groupBy / groupAll / groupRandom / shuffle limit debug Group operations
  • 165. Scalding API project / discard map / mapTo flatMap / flatMapTo rename filter unique groupBy / groupAll / groupRandom / shuffle limit debug Group operations joins
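Most of the operations listed above compose in a single pipeline. A sketch under made-up field names, inputs and argument keys, just to show how the pieces read when chained together (not from the original deck):

import com.twitter.scalding._

class ApiTourJob(args: Args) extends Job(args) {
  val users  = Tsv(args("users"),  ('userId, 'name))
  val visits = Tsv(args("visits"), ('userId, 'url))

  visits.read
    .filter('url) { url: String => url.nonEmpty }       // keep non-empty URLs
    .unique(('userId, 'url))                            // de-duplicate pairs
    .groupBy('userId) { _.size('visitCount) }           // rows: ('userId, 'visitCount)
    .joinWithSmaller('userId -> 'userId, users.read)    // bring in 'name
    .project(('name, 'visitCount))
    .write(Tsv(args("output")))
}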
  • 166. Distributed Copy in Scalding class WordCountJob(args: Args) extends Job(args) {
  • 167. Distributed Copy in Scalding class WordCountJob(args: Args) extends Job(args) { val input = Tsv(args("input")) val output = Tsv(args("output"))
  • 168. Distributed Copy in Scalding class WordCountJob(args: Args) extends Job(args) { val input = Tsv(args("input")) val output = Tsv(args("output")) input.read.write(output) }
  • 169. Distributed Copy in Scalding class WordCountJob(args: Args) extends Job(args) { val input = Tsv(args("input")) val output = Tsv(args("output")) input.read.write(output) } The End.
  • 170. Main Class - "Runner" import org.apache.hadoop.util.ToolRunner import com.twitter.scalding object ScaldingJobRunner extends App { ToolRunner.run(new Configuration, new scalding.Tool, args) }
  • 171. Main Class - "Runner" import org.apache.hadoop.util.ToolRunner import com.twitter.scalding object ScaldingJobRunner extends App { from App ToolRunner.run(new Configuration, new scalding.Tool, args) }
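Spelled out with all of its imports, the runner is still only a few lines; the hadoop jar invocation in the trailing comment is the usual pattern (the jar name and paths are hypothetical):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.util.ToolRunner
import com.twitter.scalding

object ScaldingJobRunner extends App {
  // scalding.Tool picks the Job class from the first argument
  // and passes the remaining --key value pairs on to it as Args
  ToolRunner.run(new Configuration, new scalding.Tool, args)
}

// hadoop jar my-assembly.jar ScaldingJobRunner pl.project13.scala.oculus.job.WordCountJob \
//   --hdfs --input /data/in --output /data/out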
  • 172. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { }
  • 173. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") }
  • 174. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) }
  • 175. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } def tokenize(text: String): Array[String] = implemented }
  • 176. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { group => group.size('count) } def tokenize(text: String): Array[String] = implemented }
  • 177. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { group => group.size } def tokenize(text: String): Array[String] = implemented }
  • 178. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size } def tokenize(text: String): Array[String] = implemented }
  • 179. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size } .write(Tsv(outputFile)) def tokenize(text: String): Array[String] = implemented }
  • 180. Word Count in Scalding class WordCountJob(args: Args) extends Job(args) { val inputFile = args("input") val outputFile = args("output") 4{ TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size } .write(Tsv(outputFile)) def tokenize(text: String): Array[String] = implemented }
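For completeness, the whole job once more with one possible tokenize implementation filled in; the slides leave tokenize as "implemented", so the regex below is an assumption, not part of the original deck:

import com.twitter.scalding._

class WordCountJob(args: Args) extends Job(args) {
  val inputFile  = args("input")
  val outputFile = args("output")

  TextLine(inputFile)
    .flatMap('line -> 'word) { line: String => tokenize(line) }
    .groupBy('word) { _.size('count) }
    .write(Tsv(outputFile))

  // lower-case, drop punctuation, split on whitespace
  def tokenize(text: String): Array[String] =
    text.toLowerCase.replaceAll("[^a-z0-9\\s]", "").split("\\s+")
}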
  • 181. Word Count in Scalding
  • 182. Word Count in Scalding run pl.project13.scala.oculus.job.WordCountJob --tool.graph
  • 183. Word Count in Scalding run pl.project13.scala.oculus.job.WordCountJob --tool.graph => pl.project13.scala.oculus.job.WordCountJob0.dot
  • 184. Word Count in Scalding run pl.project13.scala.oculus.job.WordCountJob --tool.graph => pl.project13.scala.oculus.job.WordCountJob0.dot M A P
  • 185. Word Count in Scalding run pl.project13.scala.oculus.job.WordCountJob --tool.graph => pl.project13.scala.oculus.job.WordCountJob0.dot M A P R E D
  • 186. Word Count in Scalding TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size('count) } .write(Tsv(outputFile))
  • 187. Word Count in Scalding TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size('count) } .write(Tsv(outputFile))
  • 188. Word Count in Scalding TextLine(inputFile) .flatMap('line -> 'word) { line: String => tokenize(line) } .groupBy('word) { _.size('count) } .write(Tsv(outputFile))
  • 191. Why Scalding? Hadoop inside Cascading abstractions
  • 192. Why Scalding? Hadoop inside Cascading abstractions Scala conciseness
  • 193. Ask Stuff! Dzięki! Thanks! ありがとう! Konrad Malawski @ java.pl t: ktosopl / g: ktoso b: blog.project13.pl