hadoop WordCount Example
- maven dependency configuration
pom.xml

```xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.dbility.hadoop</groupId>
    <artifactId>example02</artifactId>
    <version>0.0.1</version>

    <properties>
        <java.version>1.7</java.version>
        <hadoop.version>1.2.1</hadoop.version>
        <slf4j.version>1.4.3</slf4j.version>
        <scp.user>hadoop</scp.user>
        <scp.password>hadoop</scp.password>
        <scp.host>big-master</scp.host>
        <scp.copy2dir>home/hadoop/</scp.copy2dir>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>${project.artifactId}</finalName>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
                <configuration>
                    <source>${java.version}</source>
                    <target>${java.version}</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-antrun-plugin</artifactId>
                <version>1.8</version>
                <dependencies>
                    <dependency>
                        <groupId>org.apache.ant</groupId>
                        <artifactId>ant-jsch</artifactId>
                        <version>1.9.7</version>
                    </dependency>
                    <dependency>
                        <groupId>com.jcraft</groupId>
                        <artifactId>jsch</artifactId>
                        <version>0.1.54</version>
                    </dependency>
                </dependencies>
                <executions>
                    <execution>
                        <phase>install</phase>
                        <goals>
                            <goal>run</goal>
                        </goals>
                        <configuration>
                            <tasks>
                                <scp file="${project.basedir}/target/${project.artifactId}.jar"
                                     todir="${scp.user}:${scp.password}@${scp.host}:/${scp.copy2dir}"
                                     trust="true"/>
                            </tasks>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
```
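With this pom.xml, running `mvn clean install` should compile against Hadoop 1.2.1, package `target/example02.jar`, and then, via the antrun scp task bound to the install phase, copy the jar to /home/hadoop/ on the big-master host. The scp.user, scp.password, and scp.host properties are the values for this particular test cluster; substitute your own.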
- Mapper class implementation
WordCountMapper.java

```java
package com.dbility.hadoop;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private static final Logger LOG = LoggerFactory.getLogger(WordCountMapper.class);

    private static final IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {

        LOG.info("key : {}", key);
        LOG.info("value : {}", value);

        // split the input line into whitespace-separated tokens
        StringTokenizer stk = new StringTokenizer(value.toString());

        while (stk.hasMoreTokens()) {
            String str = stk.nextToken();
            LOG.info("stk.nextToken() : {}", str);
            word.set(str);
            context.write(word, one); // emit (word, 1) for every token
        } // end while
    } // end map
} // end class
```
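If you want to check the mapper on its own before going near a cluster, a minimal MRUnit test sketch is shown below. This is not part of the original example: it assumes `org.apache.mrunit:mrunit:1.1.0` (classifier `hadoop1`, to match Hadoop 1.x) and JUnit are added to the pom as test dependencies.

```java
package com.dbility.hadoop;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class WordCountMapperTest {

    @Test
    public void mapperEmitsOnePerToken() throws Exception {
        // input: one line of text; expected output: (token, 1) per word, in order
        MapDriver.newMapDriver(new WordCountMapper())
                .withInput(new LongWritable(0), new Text("read a book"))
                .withOutput(new Text("read"), new IntWritable(1))
                .withOutput(new Text("a"), new IntWritable(1))
                .withOutput(new Text("book"), new IntWritable(1))
                .runTest();
    }
}
```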
- Reducer class implementation
WordCountReducer.java

```java
package com.dbility.hadoop;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private static final Logger LOG = LoggerFactory.getLogger(WordCountReducer.class);

    private IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {

        LOG.info("key : {}", key);
        LOG.info("values : {}", values);

        // sum the 1s the mappers emitted for this word
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        } // end foreach

        result.set(sum);
        LOG.info("result : {}", result);
        context.write(key, result); // emit (word, total count)
    } // end reduce
} // end class
```
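The reducer can be exercised the same way; a sketch under the same MRUnit assumption as above:

```java
package com.dbility.hadoop;

import java.util.Arrays;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

public class WordCountReducerTest {

    @Test
    public void reducerSumsCounts() throws Exception {
        // input: key "a" with counts [1, 1]; expected output: ("a", 2)
        ReduceDriver.newReduceDriver(new WordCountReducer())
                .withInput(new Text("a"),
                        Arrays.asList(new IntWritable(1), new IntWritable(1)))
                .withOutput(new Text("a"), new IntWritable(2))
                .runTest();
    }
}
```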
- Driver class implementation
WordCount.java

```java
package com.dbility.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class WordCount {

    private static final Logger LOG = LoggerFactory.getLogger(WordCount.class);

    public static void main(String[] args) throws Exception {

        LOG.info("args : {}", args);

        if (args.length != 2) {
            LOG.error("Usage : WordCount <input> <output>");
            Runtime.getRuntime().exit(2);
        } // end if

        Configuration conf = new Configuration();

        Job job = new Job(conf, "WordCount");

        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // exit code 0 on success, 1 on failure
        Runtime.getRuntime().exit(job.waitForCompletion(true) ? 0 : 1);
    } // end main
} // end class
```
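In the run log below, JobClient warns "Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same." The driver above works as-is, but for reference here is a hedged sketch of a Tool-based variant (not the code used in this post); ToolRunner parses generic options such as -D key=value into the Configuration before run() is called:

```java
package com.dbility.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountTool extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage : WordCountTool <input> <output>");
            return 2;
        }
        // getConf() already contains any generic options parsed by ToolRunner
        Job job = new Job(getConf(), "WordCountTool");
        job.setJarByClass(WordCountTool.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new WordCountTool(), args));
    }
}
```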
- Build, run, and results
```
[hadoop@big-master ~]$ /hadoop/bin/hadoop jar /home/hadoop/example02.jar com.dbility.hadoop.WordCount input.txt wordcount_output
16/09/09 12:07:04 INFO hadoop.WordCount: args : input.txt
16/09/09 12:07:05 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
16/09/09 12:07:05 INFO input.FileInputFormat: Total input paths to process : 1
16/09/09 12:07:05 INFO util.NativeCodeLoader: Loaded the native-hadoop library
16/09/09 12:07:05 WARN snappy.LoadSnappy: Snappy native library not loaded
16/09/09 12:07:05 INFO mapred.JobClient: Running job: job_201609091152_0003
16/09/09 12:07:06 INFO mapred.JobClient:  map 0% reduce 0%
16/09/09 12:07:10 INFO mapred.JobClient:  map 100% reduce 0%
16/09/09 12:07:19 INFO mapred.JobClient:  map 100% reduce 100%
16/09/09 12:07:19 INFO mapred.JobClient: Job complete: job_201609091152_0003
16/09/09 12:07:19 INFO mapred.JobClient: Counters: 29
16/09/09 12:07:19 INFO mapred.JobClient:   Job Counters
16/09/09 12:07:19 INFO mapred.JobClient:     Launched reduce tasks=1
16/09/09 12:07:19 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=4212
16/09/09 12:07:19 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
16/09/09 12:07:19 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
16/09/09 12:07:19 INFO mapred.JobClient:     Launched map tasks=1
16/09/09 12:07:19 INFO mapred.JobClient:     Data-local map tasks=1
16/09/09 12:07:19 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=8556
16/09/09 12:07:19 INFO mapred.JobClient:   File Output Format Counters
16/09/09 12:07:19 INFO mapred.JobClient:     Bytes Written=26
16/09/09 12:07:19 INFO mapred.JobClient:   FileSystemCounters
16/09/09 12:07:19 INFO mapred.JobClient:     FILE_BYTES_READ=67
16/09/09 12:07:19 INFO mapred.JobClient:     HDFS_BYTES_READ=134
16/09/09 12:07:19 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=110585
16/09/09 12:07:19 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=26
16/09/09 12:07:19 INFO mapred.JobClient:   File Input Format Counters
16/09/09 12:07:19 INFO mapred.JobClient:     Bytes Read=25
16/09/09 12:07:19 INFO mapred.JobClient:   Map-Reduce Framework
16/09/09 12:07:19 INFO mapred.JobClient:     Map output materialized bytes=67
16/09/09 12:07:19 INFO mapred.JobClient:     Map input records=2
16/09/09 12:07:19 INFO mapred.JobClient:     Reduce shuffle bytes=67
16/09/09 12:07:19 INFO mapred.JobClient:     Spilled Records=12
16/09/09 12:07:19 INFO mapred.JobClient:     Map output bytes=49
16/09/09 12:07:19 INFO mapred.JobClient:     Total committed heap usage (bytes)=300417024
16/09/09 12:07:19 INFO mapred.JobClient:     CPU time spent (ms)=1680
16/09/09 12:07:19 INFO mapred.JobClient:     Combine input records=0
16/09/09 12:07:19 INFO mapred.JobClient:     SPLIT_RAW_BYTES=109
16/09/09 12:07:19 INFO mapred.JobClient:     Reduce input records=6
16/09/09 12:07:19 INFO mapred.JobClient:     Reduce input groups=4
16/09/09 12:07:19 INFO mapred.JobClient:     Combine output records=0
16/09/09 12:07:19 INFO mapred.JobClient:     Physical memory (bytes) snapshot=304570368
16/09/09 12:07:19 INFO mapred.JobClient:     Reduce output records=4
16/09/09 12:07:19 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=1556480000
16/09/09 12:07:19 INFO mapred.JobClient:     Map output records=6
[hadoop@big-master ~]$ /hadoop/bin/hadoop fs -cat /user/hadoop/wordcount_output/part-r-00000
a       2
book    2
read    1
write   1
```
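One thing the counters make visible: Combine input records=0, meaning no combiner ran and all six (word, 1) pairs crossed the shuffle. Since this sum is associative and commutative, the reducer class could double as a combiner. A hypothetical one-line addition to the driver (not in the original post):

```java
// hypothetical: reuse WordCountReducer as a combiner to pre-aggregate map output
job.setCombinerClass(WordCountReducer.class);
```

On this toy input it hardly matters, but on larger inputs this would be expected to cut the "Map output materialized bytes" and "Reduce shuffle bytes" counters.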
Reference: 시작하세요! 하둡 프로그래밍, 개정2판 (위키북스), by 정재화