
Hadoop WordCount example

DBILITY 2016. 9. 9. 14:09
  1. Maven dependency configuration
    pom.xml
    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
        <groupId>com.dbility.hadoop</groupId>
        <artifactId>example02</artifactId>
        <version>0.0.1</version>
        <properties>
            <java.version>1.7</java.version>
            <hadoop.version>1.2.1</hadoop.version>
            <slf4j.version>1.4.3</slf4j.version>
            <scp.user>hadoop</scp.user>
            <scp.password>hadoop</scp.password>
            <scp.host>big-master</scp.host>
            <scp.copy2dir>home/hadoop/</scp.copy2dir>
        </properties>
        <dependencies>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-core</artifactId>
                <version>${hadoop.version}</version>
            </dependency>
            <dependency>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-api</artifactId>
                <version>${slf4j.version}</version>
            </dependency>
        </dependencies>
        <build>
            <finalName>${project.artifactId}</finalName>
            <plugins>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.5.1</version>
                    <configuration>
                        <source>${java.version}</source>
                        <target>${java.version}</target>
                    </configuration>
                </plugin>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-antrun-plugin</artifactId>
                    <version>1.8</version>
                    <dependencies>
                        <dependency>
                            <groupId>org.apache.ant</groupId>
                            <artifactId>ant-jsch</artifactId>
                            <version>1.9.7</version>
                        </dependency>
                        <dependency>
                            <groupId>com.jcraft</groupId>
                            <artifactId>jsch</artifactId>
                            <version>0.1.54</version>
                        </dependency>
                    </dependencies>
                    <executions>
                        <execution>
                            <phase>install</phase>
                            <goals>
                                <goal>run</goal>
                            </goals>
                            <configuration>
                                <tasks>
                                    <scp file="${project.basedir}/target/${project.artifactId}.jar" todir="${scp.user}:${scp.password}@${scp.host}:/${scp.copy2dir}" trust="true"></scp>
                                </tasks>
                            </configuration>
                        </execution>
                    </executions>
                </plugin>
            </plugins>
        </build>
    </project>
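    Running mvn clean install compiles the job against Hadoop 1.2.1, packages it as target/example02.jar, and, through the maven-antrun-plugin scp task bound to the install phase above, copies the jar to /home/hadoop/ on big-master. The user, password, host, and target directory all come from the <properties> block, so adjust them to your own cluster before building.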
  2. Mapper class implementation
    WordCountMapper.java
    package com.dbility.hadoop;
    
    import java.io.IOException;
    import java.util.StringTokenizer;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    
    public class WordCountMapper extends
    		Mapper<LongWritable, Text, Text, IntWritable> {
    
    	private static final Logger LOG = LoggerFactory.getLogger(WordCountMapper.class);
    
    	private static final IntWritable one = new IntWritable(1);
    	private Text word = new Text();
    
    	@Override
    	protected void map(LongWritable key, Text value, Context context)
    			throws IOException, InterruptedException {
    
    		LOG.info("key : {}",key);
    		LOG.info("value : {}",value);
    
    		StringTokenizer stk = new StringTokenizer(value.toString());
    
    		while (stk.hasMoreTokens()) {
    			String str = stk.nextToken();
    			LOG.info("stk.nextToken() : {}",str);
    			word.set(str);
    			context.write(word, one);
    		} //end while
    	} //end map
    } //end class
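    The mapper can be sanity-checked without a cluster. Below is a minimal MRUnit sketch; it assumes org.apache.mrunit:mrunit:1.1.0 (classifier hadoop1) and JUnit are added as test-scoped dependencies, neither of which is in the pom.xml above.
    WordCountMapperTest.java
    package com.dbility.hadoop;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mrunit.mapreduce.MapDriver;
    import org.junit.Test;
    
    public class WordCountMapperTest {
    
    	@Test
    	public void mapperEmitsOnePerToken() throws Exception {
    		// one input line should yield (token, 1) for each whitespace-separated word
    		MapDriver.newMapDriver(new WordCountMapper())
    				.withInput(new LongWritable(0), new Text("read a book"))
    				.withOutput(new Text("read"), new IntWritable(1))
    				.withOutput(new Text("a"), new IntWritable(1))
    				.withOutput(new Text("book"), new IntWritable(1))
    				.runTest();
    	} // end mapperEmitsOnePerToken
    } // end class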
  3. Reducer class implementation
    WordCountReducer.java
    package com.dbility.hadoop;
    
    import java.io.IOException;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    
    public class WordCountReducer extends
    		Reducer<Text, IntWritable, Text, IntWritable> {
    
    	private static final Logger LOG = LoggerFactory
    			.getLogger(WordCountReducer.class);
    
    	private IntWritable result = new IntWritable();
    
    	@Override
    	protected void reduce(Text key, Iterable<IntWritable> values,
    			Context context) throws IOException, InterruptedException {
    
    		LOG.info("key : {}", key);
    		LOG.info("values : {}", values);
    
    		int sum = 0;
    		for (IntWritable value : values) {
    			sum += value.get();
    		} // end foreach
    		result.set(sum);
    
    		LOG.info("result : {}", result);
    
    		context.write(key, result);
    	} // end reduce
    } // end class
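    The reducer is just as easy to unit-test. A matching MRUnit sketch, under the same assumed test dependencies as above:
    WordCountReducerTest.java
    package com.dbility.hadoop;
    
    import java.util.Arrays;
    
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
    import org.junit.Test;
    
    public class WordCountReducerTest {
    
    	@Test
    	public void reducerSumsCounts() throws Exception {
    		// two occurrences of "book" should collapse to ("book", 2)
    		ReduceDriver.newReduceDriver(new WordCountReducer())
    				.withInput(new Text("book"),
    						Arrays.asList(new IntWritable(1), new IntWritable(1)))
    				.withOutput(new Text("book"), new IntWritable(2))
    				.runTest();
    	} // end reducerSumsCounts
    } // end class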
  4. Driver class implementation
    WordCount.java
    package com.dbility.hadoop;
    
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    
    public class WordCount {
    
    	private static final Logger LOG = LoggerFactory.getLogger(WordCount.class);
    
    	public static void main(String[] args) throws Exception {
    
    		LOG.info("args : {}", args);
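    		// note: SLF4J binds the String[] above to its info(String, Object[]) overload,
    		// so the {} placeholder prints only args[0] (see the first log line in step 5)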
    
    		if (args.length != 2) {
    			LOG.error("Usage : WordCount <input> <output>");
    			Runtime.getRuntime().exit(2);
    		} // end if
    
    		Configuration conf = new Configuration();
    
    		Job job = new Job(conf, "WordCount");
    
    		job.setJarByClass(WordCount.class);
    		job.setMapperClass(WordCountMapper.class);
    		job.setReducerClass(WordCountReducer.class);
    
    		job.setInputFormatClass(TextInputFormat.class);
    		job.setOutputFormatClass(TextOutputFormat.class);
    
    		job.setOutputKeyClass(Text.class);
    		job.setOutputValueClass(IntWritable.class);
    
    		FileInputFormat.addInputPath(job, new Path(args[0]));
    		FileOutputFormat.setOutputPath(job, new Path(args[1]));
    
    		Runtime.getRuntime().exit( job.waitForCompletion(true) ? 0 : 1 );
    
    	} // end main
    
    } // end class
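    The driver registers no combiner; the counters in step 5 accordingly show Combine input records=0. Since the reduce logic is a plain sum, which is associative and commutative, and the reducer's input and output types are identical, the same class could also serve as a combiner to cut shuffle traffic. A one-line addition to the driver, if desired:
    
    		// optional: pre-aggregate (word, 1) pairs on the map side;
    		// safe for WordCount because addition is associative and commutative
    		job.setCombinerClass(WordCountReducer.class);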
  5. Build, run, and results
    [hadoop@big-master ~]$ /hadoop/bin/hadoop jar /home/hadoop/example02.jar com.dbility.hadoop.WordCount input.txt wordcount_output
    16/09/09 12:07:04 INFO hadoop.WordCount: args : input.txt
    16/09/09 12:07:05 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
    16/09/09 12:07:05 INFO input.FileInputFormat: Total input paths to process : 1
    16/09/09 12:07:05 INFO util.NativeCodeLoader: Loaded the native-hadoop library
    16/09/09 12:07:05 WARN snappy.LoadSnappy: Snappy native library not loaded
    16/09/09 12:07:05 INFO mapred.JobClient: Running job: job_201609091152_0003
    16/09/09 12:07:06 INFO mapred.JobClient:  map 0% reduce 0%
    16/09/09 12:07:10 INFO mapred.JobClient:  map 100% reduce 0%
    16/09/09 12:07:19 INFO mapred.JobClient:  map 100% reduce 100%
    16/09/09 12:07:19 INFO mapred.JobClient: Job complete: job_201609091152_0003
    16/09/09 12:07:19 INFO mapred.JobClient: Counters: 29
    16/09/09 12:07:19 INFO mapred.JobClient:   Job Counters
    16/09/09 12:07:19 INFO mapred.JobClient:     Launched reduce tasks=1
    16/09/09 12:07:19 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=4212
    16/09/09 12:07:19 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
    16/09/09 12:07:19 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
    16/09/09 12:07:19 INFO mapred.JobClient:     Launched map tasks=1
    16/09/09 12:07:19 INFO mapred.JobClient:     Data-local map tasks=1
    16/09/09 12:07:19 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=8556
    16/09/09 12:07:19 INFO mapred.JobClient:   File Output Format Counters
    16/09/09 12:07:19 INFO mapred.JobClient:     Bytes Written=26
    16/09/09 12:07:19 INFO mapred.JobClient:   FileSystemCounters
    16/09/09 12:07:19 INFO mapred.JobClient:     FILE_BYTES_READ=67
    16/09/09 12:07:19 INFO mapred.JobClient:     HDFS_BYTES_READ=134
    16/09/09 12:07:19 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=110585
    16/09/09 12:07:19 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=26
    16/09/09 12:07:19 INFO mapred.JobClient:   File Input Format Counters
    16/09/09 12:07:19 INFO mapred.JobClient:     Bytes Read=25
    16/09/09 12:07:19 INFO mapred.JobClient:   Map-Reduce Framework
    16/09/09 12:07:19 INFO mapred.JobClient:     Map output materialized bytes=67
    16/09/09 12:07:19 INFO mapred.JobClient:     Map input records=2
    16/09/09 12:07:19 INFO mapred.JobClient:     Reduce shuffle bytes=67
    16/09/09 12:07:19 INFO mapred.JobClient:     Spilled Records=12
    16/09/09 12:07:19 INFO mapred.JobClient:     Map output bytes=49
    16/09/09 12:07:19 INFO mapred.JobClient:     Total committed heap usage (bytes)=300417024
    16/09/09 12:07:19 INFO mapred.JobClient:     CPU time spent (ms)=1680
    16/09/09 12:07:19 INFO mapred.JobClient:     Combine input records=0
    16/09/09 12:07:19 INFO mapred.JobClient:     SPLIT_RAW_BYTES=109
    16/09/09 12:07:19 INFO mapred.JobClient:     Reduce input records=6
    16/09/09 12:07:19 INFO mapred.JobClient:     Reduce input groups=4
    16/09/09 12:07:19 INFO mapred.JobClient:     Combine output records=0
    16/09/09 12:07:19 INFO mapred.JobClient:     Physical memory (bytes) snapshot=304570368
    16/09/09 12:07:19 INFO mapred.JobClient:     Reduce output records=4
    16/09/09 12:07:19 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=1556480000
    16/09/09 12:07:19 INFO mapred.JobClient:     Map output records=6
    
    [hadoop@big-master ~]$ /hadoop/bin/hadoop fs -cat /user/hadoop/wordcount_output/part-r-00000
    a       2
    book    2
    read    1
    write   1
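    
    Reading the counters back against this output: 2 map input records (lines in input.txt), 6 map output records (words), 4 reduce input groups, and 4 reduce output records, which matches the four distinct words printed above.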

Reference: 시작하세요! 하둡 프로그래밍, revised 2nd edition (Wikibooks), by 정재화
