

Data Analysis for a Travel Website


Level 1: Compute the Average Hotel Price for Each City

Programming Requirements

Complete the code in the code window on the right:

1. The MapReduce classes are already configured; you only need to complete the data-analysis logic.
2. In the Map class, use the city ID as the Mapper's output key and the hotel price as the Mapper's output value.
3. In the Reduce class, compute the average hotel price per city ID and save it to HBase. The ROWKEY must be the city ID, the column family must be average_infos, the column qualifier must be price, and the computed average price is the cell value.

The structure of the t_city_hotels_info table is as follows:

Column family | Column name | Source data field | Description       | ROWKEY (format: cityID_hotelID)
------------- | ----------- | ----------------- | ----------------- | -------------------------------
cityInfo      | cityId      | city_id           | City ID           | city_id + "_" + id
cityInfo      | cityName    | city_name         | City name         | city_id + "_" + id
cityInfo      | pinyin      | pinyin            | City name (pinyin)| city_id + "_" + id
hotel_info    | id          | id                | Hotel ID          | city_id + "_" + id
hotel_info    | name        | name              | Hotel name        | city_id + "_" + id
hotel_info    | price       | price             | Hotel price       | city_id + "_" + id
hotel_info    | lon         | lon               | Longitude         | city_id + "_" + id
hotel_info    | url         | url               | URL               | city_id + "_" + id
hotel_info    | img         | img               | Image             | city_id + "_" + id
hotel_info    | address     | address           | Address           | city_id + "_" + id
hotel_info    | score       | score             | Score             | city_id + "_" + id
hotel_info    | dpscore     | dpscore           | User rating       | city_id + "_" + id
hotel_info    | dpcount     | dpcount           | Number of ratings | city_id + "_" + id
hotel_info    | star        | star              | Star rating       | city_id + "_" + id
hotel_info    | stardesc    | stardesc          | Comfort level     | city_id + "_" + id
hotel_info    | shortName   | shortName         | Hotel short name  | city_id + "_" + id
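To make this layout concrete, here is a small read-back sketch that is not part of the exercise: it assumes a locally reachable HBase instance and a made-up row key "58_12345", and it decodes the cityInfo:cityId and hotel_info:price cells the same way the Mapper in the solution below does.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

// Minimal read-back sketch: fetch one row of t_city_hotels_info by its composite
// row key ("58_12345" is invented for illustration) and decode the city ID and
// price exactly as the Mapper does.
public class HotelRowPeek {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("t_city_hotels_info"))) {
            Result row = table.get(new Get(Bytes.toBytes("58_12345")));
            String cityId = Bytes.toString(row.getValue("cityInfo".getBytes(), "cityId".getBytes()));
            String price = Bytes.toString(row.getValue("hotel_info".getBytes(), "price".getBytes()));
            System.out.println("cityId=" + cityId + ", price=" + price);
        }
    }
}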
Test Description
The platform will test the code you write:

Test input:

t_city_hotels_info,average_table;

Expected output:

row:58
average_infos:price 1145.6170212765958
row:59
average_infos:price 797.2197802197802

Code

package com.processdata;

import java.io.IOException;
import java.util.Scanner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.util.HBaseUtil;

/**
 * Process data stored in HBase with a MapReduce job and write the final result into another table.
 */
public class HBaseMapReduce extends Configured implements Tool {

    public static class MyMapper extends TableMapper<Text, DoubleWritable> {
        public static final byte[] column = "price".getBytes();
        public static final byte[] family = "hotel_info".getBytes();

        @Override
        protected void map(ImmutableBytesWritable rowKey, Result result, Context context)
                throws IOException, InterruptedException {
            /********** Begin *********/
            // Emit the city ID as the key and the hotel price as the value
            String cityId = Bytes.toString(result.getValue("cityInfo".getBytes(), "cityId".getBytes()));
            byte[] value = result.getValue(family, column);
            Double value1 = Double.parseDouble(Bytes.toString(value));
            DoubleWritable i = new DoubleWritable(value1);
            String priceKey = cityId;
            context.write(new Text(priceKey), i);
            /********** End *********/
        }
    }

    public static class MyTableReducer extends TableReducer<Text, DoubleWritable, ImmutableBytesWritable> {
        @Override
        public void reduce(Text key, Iterable<DoubleWritable> values, Context context)
                throws IOException, InterruptedException {
            /********** Begin *********/
            // Average all prices for one city and write the result to average_infos:price
            double sum = 0;
            int len = 0;
            for (DoubleWritable price : values) {
                len++;
                sum += price.get();
            }
            Put put = new Put(Bytes.toBytes(key.toString()));
            put.addColumn("average_infos".getBytes(), "price".getBytes(), Bytes.toBytes(String.valueOf(sum / len)));
            context.write(null, put);
            /********** End *********/
        }
    }

    public int run(String[] args) throws Exception {
        // Configure the Job
        Configuration conf = HBaseConfiguration.create(getConf());
        conf.set("hbase.zookeeper.quorum", "127.0.0.1");          // HBase (ZooKeeper) service address
        conf.set("hbase.zookeeper.property.clientPort", "2181");  // ZooKeeper client port
        Scanner sc = new Scanner(System.in);
        String arg1 = sc.next();
        String arg2 = sc.next();
        //String arg1 = "t_city_hotels_info";
        //String arg2 = "average_table";
        try {
            HBaseUtil.createTable("average_table", new String[] {"average_infos"});
        } catch (Exception e) {
            // Table creation failed
            e.printStackTrace();
        }
        Job job = configureJob(conf, new String[] {arg1, arg2});
        return job.waitForCompletion(true) ? 0 : 1;
    }

    private Job configureJob(Configuration conf, String[] args) throws IOException {
        String tablename = args[0];
        String targetTable = args[1];
        Job job = new Job(conf, tablename);
        Scan scan = new Scan();
        scan.setCaching(300);
        scan.setCacheBlocks(false);  // Never enable block caching in a MapReduce job
        // Initialize the Mapper
        TableMapReduceUtil.initTableMapperJob(tablename, scan, MyMapper.class, Text.class, DoubleWritable.class, job);
        // Initialize the Reducer
        TableMapReduceUtil.initTableReducerJob(
                targetTable,           // output table
                MyTableReducer.class,  // reducer class
                job);
        job.setNumReduceTasks(1);
        return job;
    }
}
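The class implements Tool and imports ToolRunner, but the entry point used by the platform is not shown. A minimal driver would presumably look something like the sketch below; the class name HBaseMapReduceDriver is invented here for illustration.

package com.processdata;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical driver: the platform's actual entry point is not shown in the exercise,
// but a Tool implementation is normally launched through ToolRunner like this.
public class HBaseMapReduceDriver {
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(), new HBaseMapReduce(), args);
        System.exit(exitCode);
    }
}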

Level 2: Find High-Frequency Words in Hotel Reviews

Programming Requirements

Complete the code in the code window on the right. The MapReduce classes are already configured; you only need to complete the data-analysis logic. Store all segmented words in the new table, and the platform will output the words whose frequency is greater than 100:

1. In the Map class, segment each review into words and use each word as the output key; the Mapper's output value is the constant 1.

2. In the Reduce class, count the frequency of each segmented word and save the result to HBase. The ROWKEY must be the word, the column family must be word_info, and the column qualifier must be count.
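The solution below relies on the apdplat word segmentation library and emoji-java, which it already imports. As a rough standalone sketch of that preprocessing step (the review string is invented for illustration), the map-side logic boils down to:

import java.util.List;

import org.apdplat.word.WordSegmenter;
import org.apdplat.word.segmentation.Word;

import com.vdurmont.emoji.EmojiParser;

// Standalone sketch of the map-side preprocessing: strip emojis from a review,
// segment it into words, and print each word that would be emitted with value 1.
// The review text here is invented for illustration only.
public class SegmentDemo {
    public static void main(String[] args) {
        String comment = "酒店位置很好,房间干净👍";
        String filtered = EmojiParser.removeAllEmojis(comment);  // remove emoji characters
        List<Word> words = WordSegmenter.seg(filtered);          // split into words
        for (Word w : words) {
            System.out.println(w.getText() + "\t1");
        }
    }
}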

The structure of the t_hotel_comment table is as follows:

Column family | Column name  | Source data field | Description     | ROWKEY (format: hotelID_commentID)
------------- | ------------ | ----------------- | --------------- | ----------------------------------
hotel_info    | hotel_name   | hotel_name        | Hotel name      | hotel_id + "_" + id
hotel_info    | hotel_id     | hotel_id          | Hotel ID        | hotel_id + "_" + id
comment_info  | id           | id                | Comment ID      | hotel_id + "_" + id
comment_info  | baseRoomId   | baseRoomId        | Room type       | hotel_id + "_" + id
comment_info  | content      | content           | Comment content | hotel_id + "_" + id
comment_info  | checkInDate  | checkInDate       | Check-in date   | hotel_id + "_" + id
comment_info  | postDate     | postDate          | Departure date  | hotel_id + "_" + id
comment_info  | userNickName | userNickName      | User nickname   | hotel_id + "_" + id
Test Description
The platform will test the code you write:

Test input:

t_hotel_comment,comment_word_count;

Expected output:

word:不错
word_info:count 344
word:位置
word_info:count 159
word:住
word_info:count 150
word:免费
word_info:count 110
word:入住
word_info:count 112
word:卫生
word_info:count 106
word:地铁站
word_info:count 144
word:巴士
word_info:count 174
word:干净
word_info:count 211
word:很好
word_info:count 200
word:性价比
word_info:count 123
word:房间
word_info:count 449
word:早餐
word_info:count 116
word:环境
word_info:count 166
word:葵
word_info:count 112
word:酒店
word_info:count 970
word:香港
word_info:count 224

Code

package com.processdata;

import java.io.IOException;
import java.util.List;
import java.util.Scanner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apdplat.word.WordSegmenter;
import org.apdplat.word.segmentation.Word;

import com.util.HBaseUtil;
import com.vdurmont.emoji.EmojiParser;

/**
 * Word frequency count.
 */
public class WorldCountMapReduce extends Configured implements Tool {

    public static class MyMapper extends TableMapper<Text, IntWritable> {
        private static byte[] family = "comment_info".getBytes();
        private static byte[] column = "content".getBytes();

        @Override
        protected void map(ImmutableBytesWritable rowKey, Result result, Context context)
                throws IOException, InterruptedException {
            /********** Begin *********/
            // Strip emojis from the comment, segment it into words, and emit (word, 1) pairs
            byte[] value = result.getValue(family, column);
            String word = new String(value, "utf-8");
            if (!word.isEmpty()) {
                String filter = EmojiParser.removeAllEmojis(word);
                List<Word> segs = WordSegmenter.seg(filter);
                for (Word cont : segs) {
                    Text text = new Text(cont.getText());
                    IntWritable v = new IntWritable(1);
                    context.write(text, v);
                }
            }
            /********** End *********/
        }
    }

    public static class MyReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {
        private static byte[] family = "word_info".getBytes();
        private static byte[] column = "count".getBytes();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            /********** Begin *********/
            // Sum the counts for one word and write the total to word_info:count
            int sum = 0;
            for (IntWritable value : values)
                sum += value.get();
            Put put = new Put(Bytes.toBytes(key.toString()));
            put.addColumn(family, column, Bytes.toBytes(sum));
            context.write(null, put);
            /********** End *********/
        }
    }

    public int run(String[] args) throws Exception {
        // Configure the Job
        Configuration conf = HBaseConfiguration.create(getConf());
        conf.set("hbase.zookeeper.quorum", "127.0.0.1");          // HBase (ZooKeeper) service address
        conf.set("hbase.zookeeper.property.clientPort", "2181");  // ZooKeeper client port
        Scanner sc = new Scanner(System.in);
        String arg1 = sc.next();
        String arg2 = sc.next();
        try {
            HBaseUtil.createTable("comment_word_count", new String[] {"word_info"});
        } catch (Exception e) {
            // Table creation failed
            e.printStackTrace();
        }
        Job job = configureJob(conf, new String[] {arg1, arg2});
        return job.waitForCompletion(true) ? 0 : 1;
    }

    private Job configureJob(Configuration conf, String[] args) throws IOException {
        String tablename = args[0];
        String targetTable = args[1];
        Job job = new Job(conf, tablename);
        Scan scan = new Scan();
        scan.setCaching(300);
        scan.setCacheBlocks(false);  // Never enable block caching in a MapReduce job
        // Initialize the Mapper and Reducer
        TableMapReduceUtil.initTableMapperJob(tablename, scan, MyMapper.class, Text.class, IntWritable.class, job);
        TableMapReduceUtil.initTableReducerJob(targetTable, MyReducer.class, job);
        job.setNumReduceTasks(1);
        return job;
    }
}
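Because the reducer stores each total with Bytes.toBytes(int), the counts come back as 4-byte integers. A possible verification sketch that reproduces the platform's "frequency greater than 100" listing, assuming a locally reachable HBase and the comment_word_count table produced above, is:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

// Verification sketch: scan comment_word_count and print every word whose
// word_info:count value (a 4-byte int written by the reducer) is greater than 100.
public class HighFrequencyWords {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("comment_word_count"));
             ResultScanner scanner = table.getScanner(new Scan())) {
            for (Result row : scanner) {
                int count = Bytes.toInt(row.getValue("word_info".getBytes(), "count".getBytes()));
                if (count > 100) {
                    System.out.println("word:" + Bytes.toString(row.getRow()));
                    System.out.println("word_info:count " + count);
                }
            }
        }
    }
}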

Reposted from: https://blog.csdn.net/qq_52331221/article/details/127345086
Copyright belongs to the original author 随兴随缘. If there is any infringement, please contact us and we will remove it.
