Kamil Chmielewski          Jacek Juraszek




              Hadoop
    in search of the golden hammer
Source: IDC's Digital Universe Study, sponsored by EMC, June 2011
• Facebook – 30 PB (2011)
   • 2,000 servers
   • 22,400 cores
   • 64 TB RAM

• Yahoo – 14 PB (2010)
   • 4,000 servers

• Ebay – 5.3 PB
   • 532 servers
   • 4,256 cores

• Google – 24 PB ???
The growth of computing power




                     Source: The Free Lunch Is Over, Herb Sutter
HDFS architecture
HDFS File System Shell
• hadoop fs -cat file:///file3 /user/hadoop/file4

• hadoop fs -cp /user/hadoop/file1 /user/hadoop/file2

• hadoop fs -du /user/hadoop/dir1

• hadoop fs -get hdfs://nn.example.com/user/hadoop/file localfile

• hadoop fs -ls /user/hadoop/file1

• hadoop fs -mkdir hdfs://nn1.example.com/user/hadoop/dir

• hadoop fs -mv /user/hadoop/file1 /user/hadoop/file2

• hadoop fs -put localfile hdfs://nn.example.com/hadoop/hadoopfile

• hadoop fs -rm hdfs://nn.example.com/file

• hadoop fs -tail pathname
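
The same operations are available programmatically through the Java org.apache.hadoop.fs.FileSystem API; a minimal sketch (paths are illustrative, configuration comes from core-site.xml / hdfs-site.xml on the classpath):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsShellEquivalents {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();   // reads core-site.xml / hdfs-site.xml
    FileSystem fs = FileSystem.get(conf);       // the configured default filesystem (HDFS)

    fs.mkdirs(new Path("/user/hadoop/dir"));                                    // hadoop fs -mkdir
    fs.copyFromLocalFile(new Path("localfile"), new Path("/user/hadoop/"));     // hadoop fs -put
    for (FileStatus st : fs.listStatus(new Path("/user/hadoop"))) {             // hadoop fs -ls
      System.out.println(st.getPath() + "  " + st.getLen());
    }
    fs.rename(new Path("/user/hadoop/file1"), new Path("/user/hadoop/file2"));  // hadoop fs -mv
    fs.delete(new Path("/user/hadoop/file2"), false);                           // hadoop fs -rm
    fs.close();
  }
}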
A distributed client?
NameNode HA
Parallelization – MapReduce
function map(String name, String document):
  // name: document name
  // document: document contents
  for each word w in document:
    emit (w, 1)

function reduce(String word, Iterator partialCounts):
  // word: a word
  // partialCounts: a list of aggregated partial counts
  sum = 0
  for each pc in partialCounts:
    sum += pc
  emit (word, sum)




                                        http://en.wikipedia.org/wiki/MapReduce
MapReduce – Hadoop JAVA




                             63 lines !!!




                   http://wiki.apache.org/hadoop/WordCount
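
For reference, a condensed sketch of the heart of that WordCount — just the new-API mapper and reducer (these are the nested classes from the wiki listing; the imports, driver and boilerplate around them are what push it to ~63 lines):

public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
  private final static IntWritable one = new IntWritable(1);
  private final Text word = new Text();

  public void map(Object key, Text value, Context context)
      throws IOException, InterruptedException {
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
      word.set(itr.nextToken());
      context.write(word, one);                 // emit (w, 1)
    }
  }
}

public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
  public void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable val : values) {
      sum += val.get();
    }
    context.write(key, new IntWritable(sum));   // emit (word, sum)
  }
}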
MapReduce – Apache PIG
input_lines = LOAD '/tmp/my-copy-of-all-pages-on-internet'
   AS (line:chararray);

-- Extract words from each line and put them into a pig bag
-- datatype, then flatten the bag to get one word on each row
words = FOREACH input_lines GENERATE FLATTEN(TOKENIZE(line)) AS word;

-- filter out any words that are just white spaces
filtered_words = FILTER words BY word MATCHES '\w+';

-- create a group for each word
word_groups = GROUP filtered_words BY word;
-- count the entries in each group
word_count = FOREACH word_groups
   GENERATE COUNT(filtered_words) AS count, group AS word;

-- order the records by count
ordered_word_count = ORDER word_count BY count DESC;
STORE ordered_word_count INTO '/tmp/number-of-words-on-internet';

                                  7 lines good, 63 lines bad


                                  http://en.wikipedia.org/wiki/Pig_(programming_tool)
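
A script like this runs either locally or on the cluster straight from the pig launcher (the file name is illustrative):

pig -x local wordcount.pig        # local mode, handy for testing on small files
pig -x mapreduce wordcount.pig    # compiled into MapReduce jobs on the cluster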
A real-life example

The Java MapReduce implementation (MetricsMapper + MetricsReducer):

public static class MetricsMapper extends TableMapper<Text, IntWritable> {
  private final static Logger log = LoggerFactory.getLogger(MetricsMapper.class);

  protected void map(ImmutableBytesWritable key, Result value,
      Mapper<ImmutableBytesWritable, Result, Text, IntWritable>.Context context)
      throws IOException, InterruptedException {
    String query = Bytes.toString(value.getValue(RawDataFamily.CF_B, RawDataFamily.QUERY.getColumn()));
    Map<String, String> infoTags = getValuesFromQuery(query, KEYS);
    long eventTime = toLong(value.getValue(EvalDataFamily.CF_B, EvalDataFamily.CREATE_TIME.getColumn()));
    long eventTruncatedToDay = timestampToDay(eventTime);
    for (String key : KEYS) {
      String tagValue = resolveTagValue(key, value, infoTags);
      int visitCount = toInt(value.getValue(EvalDataFamily.CF_B, EvalDataFamily.VISIT_COUNT.getColumn()));
      context.write(new Text(eventTruncatedToDay + KEY_DELIMITER + infoKey(key)
          + KEY_DELIMITER + tagValue), new IntWritable(visitCount));
    }
  }

  private Map<String, String> getValuesFromQueryString(String query, Set<String> keys) {
    String[] keyVal = split(query, '&');
    Map<String, String> result = new HashMap<String, String>();
    for (String s : keyVal) {
      String[] kv = split(s, '=');
      if (keys.contains(kv[0]) && kv.length > 1) {
        result.put(kv[0], kv[1]);
      }
    }
    return result;
  }

  private String infoKey(String key) {
    if (SOURCE.equals(key)) {
      return SOURCE;
    } else if (MEDIUM.equals(key)) {
      return MEDIUM;
    }
    return key;
  }

  private String resolveTagValue(String attr, Result result, Map<String, String> allTags) {
    String tagValue = allTags.get(attr);
    tagValue = StringUtils.isBlank(tagValue) ? UNDEFINED : tagValue;
    if (SOURCE.equals(attr)) {
      if (!UNDEFINED.equals(tagValue)) {
        return tagValue;
      }
      String direct = Bytes.toString(result.getValue(RawDataFamily.CF_B, RawDataFamily.DIRECT.getColumn()));
      if (StringUtils.isNotBlank(direct)) {
        return retrieveOrigin(direct);
      }
      return DIRECT;
    } else if (MEDIUM.equals(attr)) {
      String source = resolveTagValue(SOURCE, result, allTags);
      return source + VALUE_DELIMITER + tagValue;
    }
    return tagValue;
  }

  private String retrieveHost(String url) {
    if (StringUtils.isNotBlank(url)) {
      try {
        return (new URL(url)).getHost().replaceFirst("www.", "");
      } catch (MalformedURLException e) {
        log.warn("Malformed URL '{}'. Could not retrieve host value.", url);
        // … the rest of this method is cut off on the slide

public static class MetricsReducer extends TableReducer<Text, IntWritable, Writable> {
  protected void reduce(Text key, Iterable<IntWritable> visitCounts,
      Reducer<Text, IntWritable, Writable, Writable>.Context context)
      throws IOException, InterruptedException {
    long visits = 0;
    long pv = 0;
    long bounces = 0;
    for (IntWritable vc : visitCounts) {
      visits++;
      pv += vc.get();
      bounces += vc.get() == 1 ? 1 : 0;
    }
    context.write(
        null,
        new Put(Bytes.toBytes(key.toString()))
            .add(Constants.CF_B, Constants.VISITS.getColumn(), toBytes(visits))
            .add(Constants.CF_B, Constants.PV.getColumn(), toBytes(pv))
            .add(Constants.CF_B, Constants.BOUNCES.getColumn(), toBytes(bounces)));
  }
}

And this is PIG…

DEFINE extractor pl.allegro.cm.pig.udf.specific.Extractor();
DEFINE isNotBlank pl.allegro.cm.pig.udf.IsNotBlank();
DEFINE concat pl.allegro.cm.pig.udf.Concat();

in = LOAD 'events.$account' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage(
    'r:userId e:processId e:createTime r:query r:direct e:newprocess',
    '-caster HBaseBinaryConverter')
  AS (userId:chararray, processId:chararray, createTime:chararray,
      query:chararray, direct:chararray, newprocess:chararray);

rows = FILTER in BY (userId IS NOT NULL) AND (processId IS NOT NULL) AND (createTime IS NOT NULL);
rows = FOREACH rows GENERATE SUBSTRING(createTime,0,10) AS createTime, userId, processId, query, direct, newprocess;
rows = FILTER rows BY '$lower' <= createTime AND '$upper' >= createTime;

processs = GROUP rows BY (userId,processId);
processs = FOREACH processs GENERATE concat(group.$0,'|',group.$1) AS countId, COUNT($1) AS count;

firstEvFromEachprocess = FILTER rows BY (newprocess IS NOT NULL);
firstEvFromEachprocess = FOREACH firstEvFromEachprocess GENERATE createTime AS ct,
    concat(userId,'|',processId) AS campId, extractor(query,direct) AS params;

joinedData = JOIN firstEvFromEachprocess BY procId, processs BY countId;
unpackParams = FOREACH joinedData GENERATE ct AS t, FLATTEN(params), count AS c, (count==1 ? 1 : 0) AS b;

dataForWrite = GROUP unpackParams BY (t,$1,$2);
dataForWrite = FOREACH dataForWrite GENERATE
    group.t, group.$1, group.$2, SUM(unpackParams.b), SUM(unpackParams.c), COUNT(unpackParams);

STORE dataForWrite INTO 'metrics' USING org.apache.pig.piggybank.storage.DBStorage(
    '$driver', '$url', '$usr', '$pass',
    'INSERT INTO metrics (account, date, key, value, cripled, events, processs)
     VALUES ("$account", ?, ?, ?, ?, ?, ?)
     ON DUPLICATE KEY UPDATE cripled=VALUES(cripled), events=VALUES(events), processs=VALUES(processs)');
Hadoop + MongoDB

[diagram] MongoDB serves the online data; data flushed from MongoDB lands in HADOOP as
archive data, MapReduce (MR) runs the batch processing there, and the batch-processing
results flow back to MongoDB.
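
One way to wire the "batch result" arrow back into MongoDB is the mongo-hadoop connector — assuming that connector is in play; the URI, path and job name below are illustrative:

Configuration conf = new Configuration();
conf.set("mongo.output.uri", "mongodb://mongohost:27017/analytics.results");   // where batch results land

Job job = Job.getInstance(conf, "archive-to-mongo");
job.setOutputFormatClass(com.mongodb.hadoop.MongoOutputFormat.class);          // mongo-hadoop connector
FileInputFormat.addInputPath(job, new Path("/archive/flushed"));               // input: data flushed from the online store into HDFS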
Filesystem = HDFS ?
HBase

key           timestamp   cf dane      cf adres
80071223097   t3                       miasto=Warszawa
80071223097   t2                       miasto=Gdańsk
80071223097   t1          imie=Jan
86121267222   t2                       ulica=Długa
86121267222   t1          imie=Maria   miasto=Poznań
HTable table = new HTable("osoby");
Put event = new Put(Bytes.toBytes("80071223097"))
  .add(Bytes.toBytes("dane"),
      Bytes.toBytes("imie"), Bytes.toBytes("Jan"))
  .add(Bytes.toBytes("adres"),
      Bytes.toBytes("miasto"), Bytes.toBytes("Warszawa"));
table.put(event);



// https://github.com/nearinfinity/hbase-dsl
HTable table = new HTable("osoby");
hBase.save(table).row("80071223097").
           family("dane").col("imie", "Jan").
           family("adres").col("miasto", "Warszawa");



# http://happybase.readthedocs.org/
table = connection.table('osoby')
table.put('80071223097',
              {'dane:imie': 'Jan', 'adres:miasto': 'Warszawa'})
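
Reading the row back is symmetrical; a minimal sketch with the plain Java client, reusing the table and qualifiers from the examples above:

HTable table = new HTable("osoby");
Result row = table.get(new Get(Bytes.toBytes("80071223097")));
String imie   = Bytes.toString(row.getValue(Bytes.toBytes("dane"),  Bytes.toBytes("imie")));
String miasto = Bytes.toString(row.getValue(Bytes.toBytes("adres"), Bytes.toBytes("miasto")));
System.out.println(imie + " / " + miasto);   // Jan / Warszawa (the newest version of adres:miasto)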
HBase Shell
# Count rows in a table
def _count_internal(interval = 1000, caching_rows = 10)
 # We can safely set scanner caching with the first key only filter
 scan = org.apache.hadoop.hbase.client.Scan.new
 scan.cache_blocks = false
 scan.caching = caching_rows
 scan.setFilter(org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter.new)

 # Run the scanner
 scanner = @table.getScanner(scan)
 count = 0
 iter = scanner.iterator

 # Iterate results
 while iter.hasNext
  row = iter.next
  count += 1
  next unless (block_given? && count % interval == 0)
  # Allow command modules to visualize counting process
  yield(count, String.from_java_bytes(row.getRow))
 end

 # Return the counter
 return count
end
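
This helper sits behind the shell's count command, so in day-to-day use you just call it with the same knobs (table name reused from the earlier examples):

count 'osoby', INTERVAL => 10000, CACHE => 1000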
The packaging nightmare

org.apache.hadoop.mapred             org.apache.hadoop.mapreduce

   Has everything                      Friendly API
   Status: legacy                      Base classes
   Chain MR                            Contexts
   JOIN operation on MR                Support for CLI and CoC
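
For comparison, a driver written against the newer org.apache.hadoop.mapreduce API — a sketch reusing the WordCount mapper/reducer shown earlier (Job.getInstance is the Hadoop 2.x form; on 0.20/1.x it would be new Job(conf, ...)):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCountDriver.class);
    job.setMapperClass(TokenizerMapper.class);      // mapper from the earlier sketch
    job.setCombinerClass(IntSumReducer.class);      // the reducer doubles as a combiner
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}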




          Gotchas from the Maven repo:

           Repackaged GUAVA
           Dependencies on commons-logging
           Distributions only in 3rd-party repos
           HBASE with dependencies on jetty and servlet-api
Version mess
Example system architecture

                       MR = Batch




           Databases still give the application its meaning
Hadoop + SOLR = SOLR Cloud
Not every problem
is big enough…




            FACEBOOK CLUSTER
             2k machines
             12 TB per machine
             30 PB total capacity
             1,200 machines x 8 cores
             800 machines x 16 cores
Use cases
• Document indexing

• Web-service usage analytics

• Server and firewall logs

• Image and video repositories

• System-parameter metrics

• Recommendation systems
More info …

http://hortonworks.com/blog/

http://www.cloudera.com/blog/

http://hadoopblog.blogspot.com/

http://www.larsgeorge.com/

http://natishalom.typepad.com/nati_shaloms_blog/

http://developer.yahoo.com/blogs/ydn/categories/hadoop/

http://bradhedlund.com/topics/big-data/
