The document introduces Hadoop and its applications, with examples of companies such as Facebook that use it. It covers Hadoop components such as HDFS, MapReduce, Pig and HBase, shows how Hadoop can be combined with databases like MongoDB and search engines like Solr, notes that not every problem requires large-scale solutions, and lists use cases including log analysis, document indexing and recommendation systems.
10. Parallelisation – MapReduce
function map(String name, String document):
  // name: document name
  // document: document contents
  for each word w in document:
    emit (w, 1)

function reduce(String word, Iterator partialCounts):
  // word: a word
  // partialCounts: a list of aggregated partial counts
  sum = 0
  for each pc in partialCounts:
    sum += pc
  emit (word, sum)
http://en.wikipedia.org/wiki/MapReduce
11. MapReduce – Hadoop JAVA
63 lines!!!
http://wiki.apache.org/hadoop/WordCount
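For reference, the Java word count that the wiki page above spells out in 63 lines looks roughly like this when condensed. This is a sketch of the standard example using the newer org.apache.hadoop.mapreduce API (Job.getInstance is the Hadoop 2.x style), not a copy of the exact listing:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

  // map: emit (word, 1) for every token in the line
  public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private final Text word = new Text();

    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  // reduce: sum the partial counts for each word
  public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

Even in this condensed form most of the listing is boilerplate around two trivial functions, which is exactly what the Pig version on the next slide gets rid of.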
12. MapReduce – Apache PIG
input_lines = LOAD '/tmp/my-copy-of-all-pages-on-internet'
    AS (line:chararray);
-- Extract words from each line and put them into a pig bag
-- datatype, then flatten the bag to get one word on each row
words = FOREACH input_lines GENERATE FLATTEN(TOKENIZE(line)) AS word;
-- filter out any words that are just white spaces
filtered_words = FILTER words BY word MATCHES '\w+';
-- create a group for each word
word_groups = GROUP filtered_words BY word;
-- count the entries in each group
word_count = FOREACH word_groups
    GENERATE COUNT(filtered_words) AS count, group AS word;
-- order the records by count
ordered_word_count = ORDER word_count BY count DESC;
STORE ordered_word_count INTO '/tmp/number-of-words-on-internet';

7 lines of Pig – good; 63 lines of Java – bad.
http://en.wikipedia.org/wiki/Pig_(programming_tool)
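The same dataflow can also be driven from Java through org.apache.pig.PigServer, which is convenient when a Pig script has to run as part of a larger application. A minimal sketch, assuming ExecType.MAPREDUCE and the paths used above; the class name is illustrative:

import java.io.IOException;

import org.apache.pig.ExecType;
import org.apache.pig.PigServer;

public class WordCountPigRunner {
  public static void main(String[] args) throws IOException {
    PigServer pig = new PigServer(ExecType.MAPREDUCE);
    // register the same dataflow as in the script above
    pig.registerQuery("input_lines = LOAD '/tmp/my-copy-of-all-pages-on-internet' AS (line:chararray);");
    pig.registerQuery("words = FOREACH input_lines GENERATE FLATTEN(TOKENIZE(line)) AS word;");
    pig.registerQuery("filtered_words = FILTER words BY word MATCHES '\\w+';");
    pig.registerQuery("word_groups = GROUP filtered_words BY word;");
    pig.registerQuery("word_count = FOREACH word_groups GENERATE COUNT(filtered_words) AS count, group AS word;");
    pig.registerQuery("ordered_word_count = ORDER word_count BY count DESC;");
    // store() is what actually triggers execution of the whole pipeline
    pig.store("ordered_word_count", "/tmp/number-of-words-on-internet");
  }
}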
13. A real-life example
public static class MetricsMapper extends TableMapper<Text, IntWritable> {
  private final static Logger log = LoggerFactory.getLogger(MetricsMapper.class);

  protected void map(ImmutableBytesWritable rowKey, Result value,
      Mapper<ImmutableBytesWritable, Result, Text, IntWritable>.Context context)
      throws IOException, InterruptedException {
    String query = Bytes.toString(value.getValue(RawDataFamily.CF_B,
        RawDataFamily.QUERY.getColumn()));
    Map<String, String> infoTags = getValuesFromQueryString(query, KEYS);
    long eventTime = toLong(value.getValue(EvalDataFamily.CF_B,
        EvalDataFamily.CREATE_TIME.getColumn()));
    long eventTruncatedToDay = timestampToDay(eventTime);
    int visitCount = toInt(value.getValue(EvalDataFamily.CF_B,
        EvalDataFamily.VISIT_COUNT.getColumn()));
    for (String key : KEYS) {
      String tagValue = resolveTagValue(key, value, infoTags);
      context.write(new Text(eventTruncatedToDay + KEY_DELIMITER + infoKey(key)
          + KEY_DELIMITER + tagValue), new IntWritable(visitCount));
    }
  }

  private Map<String, String> getValuesFromQueryString(String query, Set<String> keys) {
    String[] keyVal = split(query, '&');
    Map<String, String> result = new HashMap<String, String>();
    for (String s : keyVal) {
      String[] kv = split(s, '=');
      if (keys.contains(kv[0]) && kv.length > 1) {
        result.put(kv[0], kv[1]);
      }
    }
    return result;
  }

  private String infoKey(String key) {
    if (SOURCE.equals(key)) {
      return SOURCE;
    } else if (MEDIUM.equals(key)) {
      return MEDIUM;
    }
    return key;
  }

  private String resolveTagValue(String attr, Result result, Map<String, String> allTags) {
    String tagValue = allTags.get(attr);
    tagValue = StringUtils.isBlank(tagValue) ? UNDEFINED : tagValue;
    if (SOURCE.equals(attr)) {
      if (!UNDEFINED.equals(tagValue)) {
        return tagValue;
      }
      String direct = Bytes.toString(result.getValue(RawDataFamily.CF_B,
          RawDataFamily.DIRECT.getColumn()));
      if (StringUtils.isNotBlank(direct)) {
        return retrieveOrigin(direct);
      }
      return DIRECT;
    } else if (MEDIUM.equals(attr)) {
      String source = resolveTagValue(SOURCE, result, allTags);
      return source + VALUE_DELIMITER + tagValue;
    }
    return tagValue;
  }

  private String retrieveHost(String url) {
    if (StringUtils.isNotBlank(url)) {
      try {
        return (new URL(url)).getHost().replaceFirst("www.", "");
      } catch (MalformedURLException e) {
        log.warn("Malformed URL '{}'. Could not retrieve host value.", url);
      }
    }
    // remainder of the method truncated on the original slide
  }
}

public static class MetricsReducer extends TableReducer<Text, IntWritable, Writable> {
  protected void reduce(Text key, Iterable<IntWritable> visitCounts,
      Reducer<Text, IntWritable, Writable, Writable>.Context context)
      throws IOException, InterruptedException {
    long visits = 0;
    long pv = 0;
    long bounces = 0;
    for (IntWritable vc : visitCounts) {
      visits++;
      pv += vc.get();
      bounces += vc.get() == 1 ? 1 : 0;
    }
    context.write(
        null,
        new Put(Bytes.toBytes(key.toString()))
            .add(Constants.CF_B, Constants.VISITS.getColumn(), toBytes(visits))
            .add(Constants.CF_B, Constants.PV.getColumn(), toBytes(pv))
            .add(Constants.CF_B, Constants.BOUNCES.getColumn(), toBytes(bounces)));
  }
}

And this is Pig…

DEFINE extractor pl.allegro.cm.pig.udf.specific.Extractor();
DEFINE isNotBlank pl.allegro.cm.pig.udf.IsNotBlank();
DEFINE concat pl.allegro.cm.pig.udf.Concat();

in = LOAD 'events.$account' USING org.apache.pig.backend.hadoop.hbase.HBaseStorage(
    'r:userId e:processId e:createTime r:query r:direct e:newprocess',
    '-caster HBaseBinaryConverter') AS
    (userId:chararray, processId:chararray, createTime:chararray,
     query:chararray, direct:chararray, newprocess:chararray);

rows = FILTER in BY (userId IS NOT NULL) AND (processId IS NOT NULL) AND (createTime IS NOT NULL);
rows = FOREACH rows GENERATE SUBSTRING(createTime,0,10) AS createTime, userId, processId, query, direct, newprocess;
rows = FILTER rows BY '$lower' <= createTime AND '$upper' >= createTime;

processs = GROUP rows BY (userId,processId);
processs = FOREACH processs GENERATE concat(group.$0,'|',group.$1) AS countId, COUNT($1) AS count;

firstEvFromEachprocess = FILTER rows BY (newprocess IS NOT NULL);
firstEvFromEachprocess = FOREACH firstEvFromEachprocess GENERATE createTime AS ct,
    concat(userId,'|',processId) AS campId, extractor(query,direct) AS params;

joinedData = JOIN firstEvFromEachprocess BY campId, processs BY countId;
unpackParams = FOREACH joinedData GENERATE ct AS t, FLATTEN(params), count AS c, (count==1 ? 1 : 0) AS b;

dataForWrite = GROUP unpackParams BY (t,$1,$2);
dataForWrite = FOREACH dataForWrite GENERATE
    group.t, group.$1, group.$2, SUM(unpackParams.b), SUM(unpackParams.c), COUNT(unpackParams);

STORE dataForWrite INTO 'metrics' USING
    org.apache.pig.piggybank.storage.DBStorage('$driver','$url','$usr','$pass',
    'INSERT INTO metrics (account, date, key, value, cripled, events, processs)
     VALUES ("$account", ?, ?, ?, ?, ?, ?)
     ON DUPLICATE KEY UPDATE cripled=VALUES(cripled), events=VALUES(events), processs=VALUES(processs)');
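The slides show only the mapper, the reducer and the Pig script; the missing piece is the job setup. Below is a minimal sketch of how a TableMapper/TableReducer pair like this is usually wired up with TableMapReduceUtil, where the "events" and "metrics" table names and the scan settings are assumptions rather than part of the original code:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class MetricsJobDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    Job job = Job.getInstance(conf, "metrics");
    job.setJarByClass(MetricsJobDriver.class);

    Scan scan = new Scan();
    scan.setCaching(500);        // larger scanner caching for full-table MR scans
    scan.setCacheBlocks(false);  // keep the MR scan out of the block cache

    // feed rows of the (assumed) source table to MetricsMapper
    TableMapReduceUtil.initTableMapperJob("events", scan, MetricsMapper.class,
        Text.class, IntWritable.class, job);
    // write the Puts produced by MetricsReducer into the (assumed) target table
    TableMapReduceUtil.initTableReducerJob("metrics", MetricsReducer.class, job);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}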
14. Hadoop + MongoDB
[Diagram: online data lives in MongoDB; it is flushed to Hadoop as archive data, processed there by MapReduce batch jobs, and the batch-processing results are written back to MongoDB.]
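A minimal sketch of the MapReduce side of such a setup, assuming the mongo-hadoop connector is used to read the flushed/archived collections and to write the batch result back; the URIs, database and collection names are invented for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.mongodb.hadoop.MongoInputFormat;
import com.mongodb.hadoop.MongoOutputFormat;
import com.mongodb.hadoop.util.MongoConfigUtil;

public class MongoBatchDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // read the archived/online data straight from MongoDB...
    MongoConfigUtil.setInputURI(conf, "mongodb://localhost/analytics.events");
    // ...and write the batch-processing result back to another collection
    MongoConfigUtil.setOutputURI(conf, "mongodb://localhost/analytics.results");

    Job job = Job.getInstance(conf, "mongo batch processing");
    job.setJarByClass(MongoBatchDriver.class);
    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);
    // job.setMapperClass(...) and job.setReducerClass(...) plug in the actual logic

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}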
18. HBase Shell
# Count rows in a table
def _count_internal(interval = 1000, caching_rows = 10)
  # We can safely set scanner caching with the first-key-only filter
  scan = org.apache.hadoop.hbase.client.Scan.new
  scan.cache_blocks = false
  scan.caching = caching_rows
  scan.setFilter(org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter.new)

  # Run the scanner
  scanner = @table.getScanner(scan)
  count = 0
  iter = scanner.iterator

  # Iterate results
  while iter.hasNext
    row = iter.next
    count += 1
    next unless (block_given? && count % interval == 0)
    # Allow command modules to visualize counting process
    yield(count, String.from_java_bytes(row.getRow))
  end

  # Return the counter
  return count
end
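The same trick works from the Java client API: scan with a FirstKeyOnlyFilter and block caching disabled, then count the rows that come back. A minimal sketch, assuming the classic HTable-based client; the caching value mirrors the shell default above:

import java.io.IOException;

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;

public class RowCount {
  public static long count(String tableName) throws IOException {
    HTable table = new HTable(HBaseConfiguration.create(), tableName);
    try {
      Scan scan = new Scan();
      scan.setCacheBlocks(false);               // do not pollute the block cache
      scan.setCaching(10);                      // scanner caching, like caching_rows above
      scan.setFilter(new FirstKeyOnlyFilter()); // only the first KeyValue of each row
      ResultScanner scanner = table.getScanner(scan);
      long count = 0;
      for (Result row : scanner) {
        count++;
      }
      scanner.close();
      return count;
    } finally {
      table.close();
    }
  }
}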
19. Packaging nightmare
org.apache.hadoop.mapred:
• all-in-one classes
• status: legacy
• Chain MR
• JOIN operation on MR

org.apache.hadoop.mapreduce:
• friendly API
• base classes
• contexts
• support for CLI and CoC (convention over configuration)
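A sketch of what the two packages look like side by side, using the usual word-count types; the class names are invented for illustration. The legacy package exposes Mapper as an interface fed through OutputCollector and Reporter, while the new one ships Mapper as a base class where everything goes through a single Context:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class ApiComparison {

  // Legacy API (org.apache.hadoop.mapred): implement the Mapper interface,
  // extend MapReduceBase for default configure()/close(), emit via OutputCollector
  public static class OldApiMapper extends org.apache.hadoop.mapred.MapReduceBase
      implements org.apache.hadoop.mapred.Mapper<LongWritable, Text, Text, IntWritable> {
    public void map(LongWritable key, Text value,
        org.apache.hadoop.mapred.OutputCollector<Text, IntWritable> output,
        org.apache.hadoop.mapred.Reporter reporter) throws IOException {
      output.collect(value, new IntWritable(1));
    }
  }

  // New API (org.apache.hadoop.mapreduce): extend the Mapper base class,
  // emit, report progress and read configuration through the single Context
  public static class NewApiMapper
      extends org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      context.write(value, new IntWritable(1));
    }
  }
}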
Goodies from the Maven repo:
• repackaged GUAVA
• dependencies on commons-logging
• distributions only in 3rd-party repos
• HBASE with dependencies on jetty and servlet-api
23. Not every problem is big enough…
FACEBOOK CLUSTER
2k machines
12 TB per machine
30 PB total capacity
1200 machines x 8 cores
800 machines x 16 cores
24. Use cases
• Document indexing
• Web-site usage analytics
• Server and firewall logs
• Image and video repositories
• System parameter metrics
• Recommendation systems