Hadoop Word Count Program


The pom.xml file:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.stono</groupId>
    <artifactId>mr2</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>mr2</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <java.version>1.8</java.version>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <maven.build.timestamp.format>yyyy-MM-dd HH:mm:ss</maven.build.timestamp.format>
        <hadoop-mapreduce-client.version>2.7.2</hadoop-mapreduce-client.version>
        <hbase-client.version>1.1.2</hbase-client.version>
        <slf4j.version>1.7.25</slf4j.version>
        <kafka-client.version>0.10.2.1</kafka-client.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.8</version>
            <scope>system</scope>
            <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
        </dependency>
        <!-- logging: Slf4j -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <!-- mapreduce -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop-mapreduce-client.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop-mapreduce-client.version}</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <mainClass>com.stono.basis.JobRunner</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
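A note on the build section: maven-compiler-plugin compiles for Java 1.8, and maven-assembly-plugin is bound to the package phase with the jar-with-dependencies descriptor, so mvn install produces both the ordinary jar and a fat *-jar-with-dependencies.jar whose manifest already names com.stono.basis.JobRunner as the main class. That fat jar is the one that can be run directly, as described in the run instructions further down.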

The Map program:

package com.test.basis;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/*
 * The four type parameters mean:
 * 1. the key type of the map input key-value pair; by default it is the byte offset of the input line
 * 2. the value type of the map input key-value pair; in this case the content of one line
 * 3. the key type of the reduce key-value pair
 * 4. the value type of the map output, an int
 * Because MapReduce ships data across the network, the types must be serializable,
 * and Hadoop provides its own serializable types:
 * long -> LongWritable, String -> Text, int -> IntWritable
 */
public class MapperWordCount extends Mapper<LongWritable, Text, Text, IntWritable> {

    /*
     * Override Mapper's map method with our own logic.
     * key is the offset and can be ignored.
     * value is the content of one line, e.g. "hello zhangsan you you".
     * context is used to emit the result.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // split the line we received; in this case the words are separated by spaces
        String[] values = value.toString().split(" ");
        for (String word : values) {
            // iterate over the array and emit the map output, i.e. a <word, 1> pair
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
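For example, for the sample input line mentioned in the comment above,

hello zhangsan you you

the map method splits on spaces and emits four pairs:

(hello, 1)
(zhangsan, 1)
(you, 1)
(you, 1)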

The Reduce program:

package com.test.basis;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/*
 * The reduce side of WordCount.
 * Type parameters:
 * 1. the key type returned by the map phase: String -> Text
 * 2. the value type returned by the map phase: int -> IntWritable
 * 3. the key type of the reduce output: String -> Text
 * 4. the value type of the reduce output: int -> IntWritable
 */
public class ReducerWordCount extends Reducer<Text, IntWritable, Text, IntWritable> {

    /*
     * Implement the parent class's reduce method.
     * key is the key shared by a group of key-value pairs,
     * values holds all the values of that group,
     * e.g. key "you" with values [1, 1].
     * context is used to emit the result.
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;                      // initialize a counter
        for (IntWritable value : values) {
            count++;                        // iterate over the values, adding 1 each time
        }
        context.write(key, new IntWritable(count)); // finally emit the result
    }
}
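Continuing the same example: before reduce is called, the framework groups the map output by key, so the reducer is invoked once per word, e.g. with key "you" and values [1, 1], and it writes (you, 2); the keys "hello" and "zhangsan" each end up with a count of 1.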

The JobRunner program:

package com.test.basis;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobRunner {

    /*
     * Submit the MapReduce program written above as a job.
     */
    public static void main(String[] args) throws Exception {
        // read and parse all the xxx-site.xml configuration files on the classpath
        Configuration conf = new Configuration();

        // delete the output directory first; otherwise re-running the job fails
        FileSystem fs = FileSystem.get(conf);
        String s = "/user/weblogic/mr/output/";
        Path p = new Path(s);
        fs.delete(p, true);

        // create a job
        Job wcjob = Job.getInstance(conf);

        // use the main class's class loader to locate the jar that contains this job's code
        wcjob.setJarByClass(JobRunner.class);

        // the mapper class this job uses
        wcjob.setMapperClass(MapperWordCount.class);

        // the reducer class this job uses
        wcjob.setReducerClass(ReducerWordCount.class);

        // the key/value types emitted by the mapper
        wcjob.setMapOutputKeyClass(Text.class);
        wcjob.setMapOutputValueClass(IntWritable.class);

        // the key/value types emitted by the reducer
        wcjob.setOutputKeyClass(Text.class);
        wcjob.setOutputValueClass(IntWritable.class);

        // the path of the input files this job processes
        FileInputFormat.setInputPaths(wcjob, new Path("/user/weblogic/mr/wordcount/"));

        // the path where the result files are written
        FileOutputFormat.setOutputPath(wcjob, new Path("/user/weblogic/mr/output/"));

        // submit the job to the Hadoop cluster and wait for completion
        boolean res = wcjob.waitForCompletion(true);

        // exit normally on success, with a non-zero code on failure
        System.exit(res ? 0 : 1);
    }
}
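After a successful run, the output directory /user/weblogic/mr/output/ contains a _SUCCESS marker and one part-r-NNNNN file per reducer (part-r-00000 with the default single reducer). Each line of those files is a word and its count separated by a tab; for the sample line used above the contents would be:

hello	1
you	2
zhangsan	1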

To run it: simply run mvn install, upload the jar file to the server, and then execute it with hadoop jar test.jar.
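As a concrete sketch (the jar file name below is only an assumption derived from the artifactId and version in the pom.xml above; check what Maven actually puts into the target directory):

mvn clean install
hadoop jar target/mr2-0.0.1-SNAPSHOT-jar-with-dependencies.jar

The with-dependencies jar needs no class name on the command line because the assembly plugin already wrote a main class into its manifest.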

Maven produces two jar files: one with "with-dependencies" in its name, which can be run directly, and a smaller one whose manifest does not contain the main-class information, so you have to supply the main class yourself.

In a Hadoop server environment, none of the other classes are missing, because hadoop jar already puts the Hadoop libraries on the classpath.

If you do not edit the MANIFEST.MF file by hand, add the main class on the command line when running the jar: hadoop jar mrbasis-s.jar com.stono.basis.JobRunner
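If you do choose to edit the manifest by hand, the entry the smaller jar is missing is the standard Main-Class attribute, e.g.:

Main-Class: com.stono.basis.JobRunner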

A simplified pom.xml file:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.stono</groupId>
    <artifactId>mr01</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>mr01</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <java.version>1.7</java.version>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <maven.build.timestamp.format>yyyy-MM-dd HH:mm:ss</maven.build.timestamp.format>
        <hadoop-mapreduce-client.version>2.7.2</hadoop-mapreduce-client.version>
        <hbase-client.version>1.1.2</hbase-client.version>
        <slf4j.version>1.7.25</slf4j.version>
        <kafka-client.version>0.10.2.1</kafka-client.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.8</version>
            <scope>system</scope>
            <systemPath>D:/Java/jdk1.8.0_161/lib/tools.jar</systemPath>
        </dependency>
        <!-- logging: Slf4j -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <!-- mapreduce -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop-mapreduce-client.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop-mapreduce-client.version}</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <addClasspath>false</addClasspath>
                            <mainClass>com.bsr.basis.JobRunner</mainClass> <!-- your main class -->
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
            <!--
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <mainClass>com.bsr.basis.JobRunner</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            -->
        </plugins>
    </build>
</project>
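The difference from the first pom.xml is that maven-jar-plugin now writes the Main-Class entry (com.bsr.basis.JobRunner) into the manifest of the ordinary, thin jar, so that jar can be launched with hadoop jar without naming the class on the command line, while the maven-assembly-plugin section that would build the fat jar is left commented out.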
