M.C. Kang
   Hello World Using Spring for Apache Hadoop

Declaring a Hadoop job using Spring’s Hadoop namespace
<configuration>
           fs.default.name=hdfs://localhost:9000
</configuration>
<job id="wordcountJob"
           input-path="/user/gutenberg/input"
           output-path="/user/gutenberg/output"
           mapper="org.apache.hadoop.examples.WordCount.TokenizerMapper"
           reducer="org.apache.hadoop.examples.WordCount.IntSumReducer"/>
<job-runner id="runner" job="wordcountJob" run-at-startup="true"/>

This configuration creates a singleton instance of an org.apache.hadoop.mapreduce.Job managed by the
Spring container.
Spring can determine that outputKeyClass is of type org.apache.hadoop.io.Text and that
outputValueClass is of type org.apache.hadoop.io.IntWritable, so we do not need to set these
properties explicitly.
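
Because wordcountJob is an ordinary Hadoop Job bean, it can also be looked up from the application
context and submitted by hand instead of relying on the job-runner. The following is a minimal sketch
(the class name is illustrative; the bean and context file names follow the configuration above):

import org.apache.hadoop.mapreduce.Job;
import org.springframework.context.support.ClassPathXmlApplicationContext;

public class ManualJobSubmission {
      public static void main(String[] args) throws Exception {
            ClassPathXmlApplicationContext ctx =
                  new ClassPathXmlApplicationContext("META-INF/spring/hadoop-context.xml");
            // The <job> element registers a plain Hadoop Job bean named "wordcountJob"
            Job wordcountJob = ctx.getBean("wordcountJob", Job.class);
            // Submit the job and block until it completes, printing progress to the console
            boolean success = wordcountJob.waitForCompletion(true);
            System.out.println("wordcountJob finished: " + (success ? "SUCCESS" : "FAILURE"));
            ctx.close();
      }
}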

public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable>{
      private final static IntWritable one = new IntWritable(1);
      private Text word = new Text();
      public void map(Object key, Text value, Context context)
      throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                  word.set(itr.nextToken());
                  context.write(word, one);
            }
      }
}
   Hello World Using Spring for Apache Hadoop
public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
      private IntWritable result = new IntWritable();
      public void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                       sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
      }
}


public class Main {
      private static final String[] CONFIGS = new String[] { "META-INF/spring/hadoop-context.xml" };

      public static void main(String[] args) {
            String[] res = (args != null && args.length > 0 ? args : CONFIGS);
            AbstractApplicationContext ctx = new ClassPathXmlApplicationContext(res);
            // shut down the context cleanly along with the VM
            ctx.registerShutdownHook();
      }
}
   Externalize the configuration parameters of the application
<context:property-placeholder location="hadoop-default.properties"/>
<configuration>
      fs.default.name=${hd.fs}
</configuration>
<job id="wordcountJob"
      input-path="${wordcount.input.path}"
      output-path="${wordcount.output.path}"
      mapper="org.apache.hadoop.examples.WordCount.TokenizerMapper"
      reducer="org.apache.hadoop.examples.WordCount.IntSumReducer"/>
<job-runner id="runner" job="wordcountJob" run-at-startup="true"/>

hd.fs=hdfs://localhost:9000
wordcount.input.path=/user/gutenberg/input/
wordcount.output.path=/user/gutenberg/output/


<context:property-placeholder location="hadoop-${ENV:default}.properties"/>
   Scripting HDFS on the JVM – Type1
<context:property-placeholder location="hadoop.properties"/>
<configuration>
      fs.default.name=${hd.fs}
</configuration>
<script id="setupScript" location="copy-files.groovy">
      <property name="localSourceFile" value="${localSourceFile}"/>
      <property name="hdfsInputDir" value="${hdfsInputDir}"/>
      <property name="hdfsOutputDir" value="${hdfsOutputDir}"/>
</script>



       Groovy Script
if (!fsh.test(hdfsInputDir)) {
      fsh.mkdir(hdfsInputDir);
      fsh.copyFromLocal(localSourceFile, hdfsInputDir);
      fsh.chmod(700, hdfsInputDir)
}
if (fsh.test(hdfsOutputDir)) {
      fsh.rmr(hdfsOutputDir)
}
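
The same setup logic can also be written in Java against Spring for Apache Hadoop's FsShell class,
which backs the fsh variable exposed to the Groovy script. A minimal sketch, assuming the Hadoop
Configuration bean is available for injection (the SetupScript class name is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.springframework.data.hadoop.fs.FsShell;

public class SetupScript {
      private final FsShell fsh;

      public SetupScript(Configuration hadoopConfiguration) {
            // FsShell wraps the Hadoop FileSystem behind shell-like methods (test, mkdir, copyFromLocal, ...)
            this.fsh = new FsShell(hadoopConfiguration);
      }

      public void run(String localSourceFile, String hdfsInputDir, String hdfsOutputDir) {
            if (!fsh.test(hdfsInputDir)) {
                  fsh.mkdir(hdfsInputDir);
                  fsh.copyFromLocal(localSourceFile, hdfsInputDir);
                  fsh.chmod(700, hdfsInputDir);
            }
            if (fsh.test(hdfsOutputDir)) {
                  fsh.rmr(hdfsOutputDir);
            }
      }
}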
   Combining HDFS Scripting and Job Submission
<context:property-placeholder location="hadoop.properties"/>

<configuration>
      fs.default.name=${hd.fs}
</configuration>

<job id="wordcountJob"
      input-path="${wordcount.input.path}"
      output-path="${wordcount.output.path}"
      mapper="org.apache.hadoop.examples.WordCount.TokenizerMapper"
      reducer="org.apache.hadoop.examples.WordCount.IntSumReducer"/>

<script id="setupScript" location="copy-files.groovy">
       <property name="localSourceFile" value="${localSourceFile}"/>
       <property name="inputDir" value="${wordcount.input.path}"/>
       <property name="outputDir" value="${wordcount.output.path}"/>
</script>

<job-runner id="runner" run-at-startup="true"
      pre-action="setupScript"
      job="wordcountJob"/>
   Configuring the JobRunner to execute multiple HDFS scripts and jobs
<job-runner id="runner"
      pre-action="setupScript1,setupScript"
      job="wordcountJob1,wordcountJob2"
      post-action="cleanupScript1,cleanupScript2"/>
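
Because run-at-startup is not set here, nothing executes when the context starts; the runner exposes a
call() method (the same method the TaskScheduler invokes below), so it can be triggered explicitly from
application code. A minimal sketch, assuming the context above has been loaded from the usual location:

import java.util.concurrent.Callable;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

public class RunnerInvoker {
      public static void main(String[] args) throws Exception {
            ApplicationContext ctx =
                  new ClassPathXmlApplicationContext("META-INF/spring/hadoop-context.xml");
            // Invoking call() runs the pre-actions, the configured jobs, then the post-actions
            Callable<?> runner = ctx.getBean("runner", Callable.class);
            runner.call();
      }
}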
   Scheduling MapReduce Jobs with a TaskScheduler
<!-- job definition as before -->
<job id="wordcountJob" ... />
<!-- script definition as before -->
<script id="setupScript" ... />

<job-runner id="runner" pre-action="setupScript" job="wordcountJob"/>

<task:scheduled-tasks>
      <task:scheduled ref="runner" method="call" cron="3/30 * * * * ?"/>
</task:scheduled-tasks>



   Scheduling MapReduce Jobs with Quartz
<bean id="jobDetail"
      class="org.springframework.scheduling.quartz.MethodInvokingJobDetailFactoryBean">
      <property name="targetObject" ref="runner"/>
      <property name="targetMethod" value="run"/>
</bean>
<bean id="cronTrigger" class="org.springframework.scheduling.quartz.CronTriggerBean">
      <property name="jobDetail" ref="jobDetail"/>
      <property name="cronExpression" value="3/30 * * * * ?"/>
</bean>
<bean class="org.springframework.scheduling.quartz.SchedulerFactoryBean">
      <property name="triggers" ref="cronTrigger"/>
</bean>
   Creating and configuring a Hive server
<context:property-placeholder location="hadoop.properties,hive.properties" />
<configuration id="hadoopConfiguration">
       fs.default.name=${hd.fs}
       mapred.job.tracker=${mapred.job.tracker}
</configuration>
<hive-server port="${hive.port}" auto-startup="false"
       configuration-ref="hadoopConfiguration"
       properties-location="hive-server.properties">
       hive.exec.scratchdir=/tmp/hive/
</hive-server>



   Hive Thrift Client
Since the HiveClient is not a thread-safe class, a new instance needs to be created inside methods
that are shared across multiple threads.

<hive-client-factory host="${hive.host}" port="${hive.port}"/>
   Hive Thrift Client Sample
@Repository
public class HivePasswordRepository implements PasswordRepository {
      private static final Log logger = LogFactory.getLog(HivePasswordRepository.class);
      private HiveClientFactory hiveClientFactory;
      private String tableName;
      // constructor and setters omitted
      @Override
      public Long count() {
            HiveClient hiveClient = hiveClientFactory.getHiveClient();
            try {
                   hiveClient.execute("select count(*) from " + tableName);
                   return Long.parseLong(hiveClient.fetchOne());
            // checked exceptions
            } catch (HiveServerException ex) {
                   throw translateException(ex);
            } catch (org.apache.thrift.TException tex) {
                   throw translateException(tex);
            } finally {
                   try {
                          hiveClient.shutdown();
                   } catch (org.apache.thrift.TException tex) {
                          logger.debug("Unexpected exception on shutting down HiveClient", tex);
                   }
            }
      }
      …
   Hive JDBC Client
The JDBC support for Hive lets you use your existing Spring knowledge of JdbcTemplate to
interact with Hive. Hive provides a HiveDriver class.

<bean id="hiveDriver" class="org.apache.hadoop.hive.jdbc.HiveDriver" />
<bean id="dataSource" class="org.springframework.jdbc.datasource.SimpleDriverDataSource">
      <constructor-arg name="driver" ref="hiveDriver" />
      <constructor-arg name="url" value="${hive.url}"/>
</bean>
<bean id="jdbcTemplate" class="org.springframework.jdbc.core.simple.JdbcTemplate">
      <constructor-arg ref="dataSource" />
</bean>


   Hive JDBC Client Sample
@Repository
public class JdbcPasswordRepository implements PasswordRepository {
      private JdbcOperations jdbcOperations;
      private String tableName;
      // constructor and setters omitted
      @Override
      public Long count() {
            return jdbcOperations.queryForLong("select count(*) from " + tableName);
      }
      …
   Hive Script Runner
<context:property-placeholder location="hadoop.properties,hive.properties"/>

<configuration>
      fs.default.name=${hd.fs}
      mapred.job.tracker=${mapred.job.tracker}
</configuration>

<hive-server port="${hive.port}"
      properties-location="hive-server.properties"/>

<hive-client-factory host="${hive.host}" port="${hive.port}"/>

<hive-runner id="hiveRunner" run-at-startup="false" >
       <script location="apache-log-simple.hql">
              <arguments>
                     hiveContribJar=${hiveContribJar}
                     localInPath="./data/apache.log"
              </arguments>
       </script>
</hive-runner>
   Creating and configuring a Pig server
<context:property-placeholder location="hadoop.properties" />
<configuration>
      fs.default.name=${hd.fs}
      mapred.job.tracker=${mapred.job.tracker}
</configuration>
<pig-factory properties-location="pig-server.properties">
      <script location="initialization.pig">
            <arguments>
                  inputFile=${initInputFile}
            </arguments>
      </script>
</pig-factory>


   Pig Runner
Spring for Apache Hadoop provides a PigRunner helper class that offers a convenient way to repeatedly
execute Pig jobs and to run HDFS scripts before and after their execution.

<pig-runner id="pigRunner"
      pre-action="hdfsScript"
      run-at-startup="true" >
      <script location="password-analysis.pig">
            <arguments>
                  inputDir=${inputDir}
                  outputDir=${outputDir}
            </arguments>
      </script>
</pig-runner>
   Pig Runner Example
@Component
public class AnalysisService {
      private PigRunner pigRunner;
      @Autowired
      public AnalysisService(PigRunner pigRunner) {
             this.pigRunner = pigRunner;
      }
      @Async
      public void performAnalysis() {
             pigRunner.call();
      }
}
   Controlling Runtime Script Execution
To have more runtime control over what Pig scripts are executed and the arguments passed into
them, we can use the PigTemplate class.

<pig-factory properties-location="pig-server.properties"/>
<pig-template/>
<beans:bean id="passwordRepository“
class="com.oreilly.springdata.hadoop.pig.PigPasswordRepository">
           <beans:constructor-arg ref="pigTemplate"/>
</beans:bean>

public class PigPasswordRepository implements PasswordRepository {
      private PigOperations pigOperations;
      private String pigScript = "classpath:password-analysis.pig";
      public void processPasswordFile(String inputFile) {
             Assert.notNull(inputFile);
             String outputDir = PathUtils.format(
                           "/data/password-repo/output/%1$tY/%1$tm/%1$td/%1$tH/%1$tM/%1$tS");
             Properties scriptParameters = new Properties();
             scriptParameters.put("inputDir", inputFile);
             scriptParameters.put("outputDir", outputDir);
             pigOperations.executeScript(pigScript, scriptParameters);
      }
      @Override
      public void processPasswordFiles(Collection<String> inputFiles) {
             for (String inputFile : inputFiles) {
                   processPasswordFile(inputFile);
             }
      }
}
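
A usage sketch for the repository above: it is typically registered as the passwordRepository bean shown
earlier and injected wherever a password file needs to be analyzed. The context file name and input path
below are illustrative only:

import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

public class PasswordAnalysisMain {
      public static void main(String[] args) {
            ApplicationContext ctx =
                  new ClassPathXmlApplicationContext("META-INF/spring/pig-context.xml");
            PasswordRepository repository =
                  ctx.getBean("passwordRepository", PasswordRepository.class);
            // Runs password-analysis.pig with inputDir set to this file and a timestamped outputDir
            repository.processPasswordFile("/data/password-repo/input/part-0");
      }
}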
   HBASE Java Client w/o Spring Data
The HTable class is the main way in Java to interact with HBase. It allows you to put data into a
table using a Put class, get data by key using a Get class, and delete data using a Delete
class.
You query that data using a Scan class, which lets you specify key ranges as well as filter
criteria.

Configuration configuration = new Configuration(); // Hadoop configuration object
HTable table = new HTable(configuration, "users");
Put p = new Put(Bytes.toBytes("user1"));
p.add(Bytes.toBytes("cfInfo"), Bytes.toBytes("qUser"), Bytes.toBytes("user1"));
p.add(Bytes.toBytes("cfInfo"), Bytes.toBytes("qEmail"), Bytes.toBytes("user1@yahoo.com"));
p.add(Bytes.toBytes("cfInfo"), Bytes.toBytes("qPassword"), Bytes.toBytes("user1pwd"));
table.put(p);
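
The prose above also mentions Get and Scan; the following sketch continues the snippet (it reuses the
table and column names from the Put example) and reads the row back, then scans the cfInfo column family,
using the same raw HTable API from org.apache.hadoop.hbase.client:

// Read a single row back by key
Get g = new Get(Bytes.toBytes("user1"));
Result result = table.get(g);
String email = Bytes.toString(result.getValue(Bytes.toBytes("cfInfo"), Bytes.toBytes("qEmail")));

// Scan all rows, restricted to the cfInfo column family
Scan scan = new Scan();
scan.addFamily(Bytes.toBytes("cfInfo"));
ResultScanner scanner = table.getScanner(scan);
try {
      for (Result row : scanner) {
            System.out.println(Bytes.toString(row.getRow()) + " -> "
                  + Bytes.toString(row.getValue(Bytes.toBytes("cfInfo"), Bytes.toBytes("qUser"))));
      }
} finally {
      scanner.close();
}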
   HBASE Client w/ Spring Data HBaseTemplate
The HBase API requires that you work with the data as byte arrays and not other primitive types.
The HTable class is also not thread safe, and requires you to carefully manage the underlying
resources it uses and catch HBase-specific exceptions.
Spring’s HBaseTemplate class provides a higher-level abstraction for interacting with HBase.
As with other Spring template classes, it is thread-safe once created and provides exception
translation into Spring’s portable data access exception hierarchy.

<configuration>
      fs.default.name=hdfs://localhost:9000
</configuration>
<hbase-configuration configuration-ref="hadoopConfiguration" />
<beans:bean id="hbaseTemplate" class="org.springframework.data.hadoop.hbase.HbaseTemplate">
      <beans:property name="configuration" ref="hbaseConfiguration" />
</beans:bean>
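
With the hbaseTemplate bean in place, data access code no longer touches HTable directly. The sketch
below is illustrative rather than taken from the slides; it assumes HbaseTemplate's
find(tableName, family, RowMapper) callback style and reuses the users table from the earlier example:

import java.util.List;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import org.springframework.data.hadoop.hbase.HbaseTemplate;
import org.springframework.data.hadoop.hbase.RowMapper;

public class UserRepository {
      private final HbaseTemplate hbaseTemplate;

      public UserRepository(HbaseTemplate hbaseTemplate) {
            this.hbaseTemplate = hbaseTemplate;
      }

      // Returns "user -> email" strings for every row in the users table;
      // HbaseTemplate manages the HTable resources and translates HBase exceptions
      public List<String> findAll() {
            return hbaseTemplate.find("users", "cfInfo", new RowMapper<String>() {
                  @Override
                  public String mapRow(Result result, int rowNum) throws Exception {
                        String user = Bytes.toString(result.getValue(Bytes.toBytes("cfInfo"), Bytes.toBytes("qUser")));
                        String email = Bytes.toString(result.getValue(Bytes.toBytes("cfInfo"), Bytes.toBytes("qEmail")));
                        return user + " -> " + email;
                  }
            });
      }
}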
