MapReduce----电信数据清洗
生活随笔
收集整理的這篇文章主要介紹了
MapReduce----电信数据清洗
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
MapReduce---電信數據清洗
- 數據解析及題目分析
- 數據解析
- 題目及分析
- 代碼實現
- 自定義類
- Map階段
- Reduce階段
- Driver階段
數據解析及題目分析
數據解析
數據一
18620192711,15733218050,1506628174,1506628265,650000,810000 18641241020,15733218050,1509757276,1509757464,330000,620000 15778423030,15614201525,1495290451,1495290923,370000,420000 13341109505,15151889601,1492661762,1492662200,330000,460000 13341109505,13666666666,1470111026,1470111396,360000,230000 15032293356,13799999999,1495937181,1495937360,500000,630000 15733218050,13341109505,1452601976,1452602401,620000,530000 13269361119,13269361119,1487640690,1487641023,450000,430000 13799999999,15338595369,1511928814,1511929111,540000,230000 15733218050,15778423030,1542457633,1542457678,450000,530000 13341109505,17731088562,1484364844,1484365342,460000,360000 18332562075,15778423030,1522426275,1522426473,140000,120000 13560190665,18301589432,1485648596,1485648859,620000,820000 15733218050,13520404983,1538992531,1538992605,130000,150000 15778423030,13566666666,1484008721,1484009210,810000,330000 13566666666,17731088562,1541812913,1541813214,220000,360000 15778423030,15733218050,1464198621,1464198803,630000,340000 15151889601,13341109505,1467441052,1467441538,640000,440000 18620192711,13666666666,1510997876,1510998253,450000,610000 13341109505,18641241020,1509074946,1509075201,710000,310000 17731088562,13341109505,1471571270,1471571706,430000,630000 13520404983,13560190665,1476626194,1476626683,500000,440000 15338595369,13341109505,1523996031,1523996059,420000,460000 15151889601,13341109505,1489658199,1489658394,330000,500000 13560190665,15338595369,1510890681,1510891129,410000,520000 15733218050,13566666666,1503498540,1503498726,420000,310000 17731088562,13560190665,1470571255,1470571708,540000,330000 15338595369,15614201525,1496767879,1496768364,520000,500000 17731088562,15778423030,1494602567,1494602784,500000,420000 15778423030,18641241020,1517445007,1517445358,450000,530000 13566666666,17731088562,1464697765,1464697894,360000,620000 15778423030,13799999999,1525543218,1525543493,500000,820000 13341109505,13520404983,1521861238,1521861421,500000,130000 
13566666666,13560190665,1513918160,1513918538,340000,210000 15032293356,18620192711,1485688388,1485688537,540000,530000 13799999999,13341109505,1531196363,1531196438,230000,320000 15338595369,15151889601,1512125514,1512125978,540000,810000 18332562075,13560190665,1523311951,1523312239,650000,410000 15778423030,15032293356,1467953782,1467954054,810000,540000 15151889601,15733218050,1530848147,1530848231,310000,150000 13269361119,18301589432,1541271874,1541272273,310000,310000 15032293356,15338595369,1520833915,1520834201,450000,360000 15778423030,13269361119,1452817391,1452817596,820000,410000 13520404983,18332562075,1474563316,1474563593,710000,540000 18301589432,15778423030,1473596284,1473596528,620000,310000 15732648446,15151889601,1535584645,1535585117,530000,310000 18301589432,13269361119,1511910316,1511910341,340000,320000 13560190665,18641241020,1533379659,1533379717,120000,710000 15338595369,18332562075,1474152847,1474153092,330000,500000 13520404983,17731088562,1504907456,1504907617,820000,510000 15732648446,18301589432,1521692836,1521692977,220000,370000 15032293356,15614201525,1471445293,1471445756,360000,530000 18641241020,15778423030,1517192728,1517193050,210000,610000 17731088562,15733218050,1493420249,1493420555,370000,820000 18620192711,13799999999,1477952709,1477953088,310000,140000 13666666666,13799999999,1541066076,1541066541,230000,640000 13269361119,17731088562,1540060141,1540060511,150000,540000 18332562075,13799999999,1489772390,1489772817,540000,710000 13799999999,15732648446,1503882021,1503882332,530000,520000 13566666666,15614201525,1504983084,1504983241,820000,140000 18641241020,15032293356,1463447030,1463447080,330000,640000 18301589432,13566666666,1493646451,1493646796,310000,510000 15732648446,15032293356,1537185125,1537185619,430000,810000 15338595369,13341109505,1493411872,1493411891,370000,150000 15778423030,17731088562,1540631847,1540632271,320000,500000 13666666666,15614201525,1545200734,1545200959,360000,640000 
15032293356,13799999999,1455000970,1455001084,460000,650000 18641241020,18620192711,1529968498,1529968626,410000,510000 17731088562,15732648446,1455361378,1455361505,440000,650000 18301589432,13666666666,1518564232,1518564421,210000,640000 15733218050,18620192711,1515672794,1515673149,360000,360000 13520404983,18620192711,1521620546,1521620913,820000,370000 18332562075,18641241020,1498131159,1498131300,820000,230000 13666666666,18301589432,1491354142,1491354544,220000,710000 18301589432,15614201525,1511731560,1511732015,810000,620000 13269361119,13666666666,1539065031,1539065096,810000,810000 15778423030,18641241020,1518364528,1518364995,130000,610000 15733218050,15032293356,1491974898,1491975316,340000,810000 13269361119,15733218050,1543514850,1543514946,410000,460000 13341109505,13666666666,1482223100,1482223577,220000,410000 15338595369,13341109505,1495958992,1495959292,330000,420000 13341109505,18641241020,1511010003,1511010292,540000,620000 18620192711,13269361119,1462453298,1462453559,320000,360000 13666666666,13799999999,1518047527,1518047967,640000,420000 13341109505,13666666666,1474872886,1474872907,360000,510000 13666666666,18641241020,1473575493,1473575663,150000,520000 15151889601,15732648446,1509418483,1509418891,510000,540000 13560190665,13520404983,1467696946,1467697103,150000,460000 13520404983,15614201525,1510958686,1510959064,320000,610000 15778423030,15614201525,1470012457,1470012660,210000,210000 15778423030,17731088562,1542680029,1542680382,630000,520000 18332562075,15338595369,1453896030,1453896522,640000,370000 15032293356,18620192711,1488286898,1488287248,530000,150000 18641241020,15733218050,1489804133,1489804185,150000,630000 15733218050,13666666666,1506782751,1506782854,220000,500000 13520404983,17731088562,1487421622,1487421784,230000,330000 15151889601,13269361119,1538113862,1538113902,370000,630000 15778423030,17731088562,1466691118,1466691412,540000,530000 15032293356,13520404983,1521151509,1521151701,520000,430000 
15614201525,13666666666,1464083166,1464083352,330000,650000
字段解析:呼叫者手機號,接受者手機號,開始時間戳,結束時間戳,呼叫者地址省份編碼,接受者地址省份編碼
數據二
1,110000,北京市 2,120000,天津市 3,130000,河北省 4,140000,山西省 5,150000,內蒙古自治區 6,210000,遼寧省 7,220000,吉林省 8,230000,黑龍江省 9,310000,上海市 10,320000,江蘇省 11,330000,浙江省 12,340000,安徽省 13,350000,福建省 14,360000,江西省 15,370000,山東省 16,410000,河南省 17,420000,湖北省 18,430000,湖南省 19,440000,廣東省 20,450000,廣西壯族自治區 21,460000,海南省 22,500000,重慶市 23,510000,四川省 24,520000,貴州省 25,530000,云南省 26,540000,西藏自治區 27,610000,陝西省 28,620000,甘肅省 29,630000,青海省 30,640000,寧夏回族自治區 31,650000,新疆維吾爾自治區 32,710000,臺灣省 33,810000,香港特別行政區 34,820000,澳門特別行政區
字段解析:地址id,省份編碼,省份名稱
數據三
7,18000696806,趙賀彪 8,15151889601,張倩 9,13269361119,王世昌 10,15032293356,張濤 11,17731088562,張陽 12,15338595369,李進全 13,15733218050,杜澤文 14,15614201525,任宗陽 15,15778423030,梁鵬 16,18641241020,郭美彤 17,15732648446,劉飛飛 18,13341109505,段光星 19,13560190665,唐會華 20,18301589432,楊力謀 21,13520404983,溫海英 22,18332562075,朱尚寬 23,18620192711,劉能宗 24,13566666666,劉柳 25,13666666666,鄧二 26,13799999999,菜中路
字段解析:電話ID,電話號碼,姓名
題目及分析
- 需求一和需求四可以將數據二和數據三緩存到內存里,然后進行替換操作
- 需求二簡單的時間類型轉(zhuǎn)換
- 需求三日期類型的加減
代碼實現
自定義類
import org.apache.hadoop.io.WritableComparable;import java.io.DataInput; import java.io.DataOutput; import java.io.IOException;public class Data implements WritableComparable<Data> {private String name_A;private String name_B;private String phoneNum_A;private String phoneNum_B;private String startTime;private String endTime;private String phoneLong;private String location_A;private String location_B;@Overridepublic int compareTo(Data o) {return 0;}@Overridepublic void write(DataOutput dataOutput) throws IOException {dataOutput.writeUTF(name_A);dataOutput.writeUTF(name_B);dataOutput.writeUTF(phoneNum_A);dataOutput.writeUTF(phoneNum_B);dataOutput.writeUTF(startTime);dataOutput.writeUTF(endTime);dataOutput.writeUTF(phoneLong);dataOutput.writeUTF(location_A);dataOutput.writeUTF(location_B);}@Overridepublic void readFields(DataInput dataInput) throws IOException {name_A = dataInput.readUTF();name_B = dataInput.readUTF();phoneNum_A = dataInput.readUTF();phoneNum_B = dataInput.readUTF();startTime = dataInput.readUTF();endTime = dataInput.readUTF();phoneLong = dataInput.readUTF();location_A = dataInput.readUTF();location_B = dataInput.readUTF();}public void set(String name_A, String name_B, String phoneNum_A, String phoneNum_B, String startTime, String endTime, String phoneLong, String location_A, String location_B) {this.name_A = name_A;this.name_B = name_B;this.phoneNum_A = phoneNum_A;this.phoneNum_B = phoneNum_B;this.startTime = startTime;this.endTime = endTime;this.phoneLong = phoneLong;this.location_A = location_A;this.location_B = location_B;}@Overridepublic String toString() {return name_A + "," +name_B + "," +phoneNum_A + "," +phoneNum_B + "," +startTime + "," +endTime + "," +phoneLong + "," +location_A + "," +location_B;}public String getName_A() {return name_A;}public void setName_A(String name_A) {this.name_A = name_A;}public String getName_B() {return name_B;}public void setName_B(String name_B) {this.name_B = name_B;}public String getPhoneNum_A() {return 
phoneNum_A;}public void setPhoneNum_A(String phoneNum_A) {this.phoneNum_A = phoneNum_A;}public String getPhoneNum_B() {return phoneNum_B;}public void setPhoneNum_B(String phoneNum_B) {this.phoneNum_B = phoneNum_B;}public String getStartTime() {return startTime;}public void setStartTime(String startTime) {this.startTime = startTime;}public String getEndTime() {return endTime;}public void setEndTime(String endTime) {this.endTime = endTime;}public String getPhoneLong() {return phoneLong;}public void setPhoneLong(String phoneLong) {this.phoneLong = phoneLong;}public String getLocation_A() {return location_A;}public void setLocation_A(String location_A) {this.location_A = location_A;}public String getLocation_B() {return location_B;}public void setLocation_B(String location_B) {this.location_B = location_B;} }Map階段
import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper;import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.net.URI; import java.text.SimpleDateFormat; import java.util.HashMap; import java.util.Map;public class MapTest extends Mapper<LongWritable, Text, Data, NullWritable> {private Data k = new Data();private Map<String, String> userName = new HashMap<String, String>();private Map<String, String> location = new HashMap<String, String>();//用戶姓名private String name_A;private String name_B;//用戶地址private String loc_A;private String loc_B;//通話時間的轉(zhuǎn)換private String startTime;private String endTime;//通話時間private String time;@Overrideprotected void setup(Context context) throws IOException, InterruptedException {URI[] uris = context.getCacheFiles();File user = new File(uris[0]);String line;//緩存用戶姓名信息BufferedReader br;br = new BufferedReader(new FileReader(user));while ((line = br.readLine()) != null) {userName.put(line.split(",")[1], line.split(",")[2]);}//緩存地址信息File loc = new File(uris[1]);br = new BufferedReader(new FileReader(loc));while ((line = br.readLine()) != null) {location.put(line.split(",")[1], line.split(",")[2]);}}@Overrideprotected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {String[] data = value.toString().split(",");//將用戶號碼轉(zhuǎn)換成姓名name_A = userName.get(data[0]);name_B = userName.get(data[1]);//將時間戳轉(zhuǎn)換成日期類型SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");startTime = sdf.format(Long.parseLong(data[2]));endTime = sdf.format(Long.parseLong(data[3]));//計算通話時間time = Long.parseLong(data[3]) - Long.parseLong(data[2]) + "秒";//替換地址位置loc_A = location.get(data[4]);loc_B = location.get(data[5]);//寫出數(shù)據(jù)k.set(name_A, name_B, data[0], data[1], startTime, endTime, time, loc_A, loc_B);context.write(k, 
NullWritable.get());} }Reduce階段
import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.Reducer;import java.io.IOException;public class RedTest extends Reducer<Data, NullWritable,Data,NullWritable> {@Overrideprotected void reduce(Data key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {//直接寫出數(shù)據(jù)即可for (NullWritable v:values){context.write(key,NullWritable.get());}} }Driver階段
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;import java.io.File; import java.net.URI;public class DriTest {public static void main(String[] args) throws Exception {File file = new File("D:\\MP\\電信\\output");if (file.exists()) {delFile(file);driver();} else {driver();}}public static void delFile(File file) {File[] files = file.listFiles();if (files != null && files.length != 0) {for (int i = 0; i < files.length; i++) {delFile(files[i]);}}file.delete();}public static void driver() throws Exception {Configuration conf = new Configuration();Job job = Job.getInstance(conf);job.setMapperClass(MapTest.class);job.setJarByClass(DriTest.class);job.setReducerClass(RedTest.class);job.setMapOutputKeyClass(Data.class);job.setMapOutputValueClass(NullWritable.class);job.setOutputKeyClass(Data.class);job.setOutputValueClass(NullWritable.class);URI [] uris = new URI[2];uris[0] = new URI("file:///D:/MP/電信/input/userPhone.txt");uris[1] = new URI("file:///D:/MP/電信/input/location.txt");job.setCacheFiles(uris);FileInputFormat.setInputPaths(job, "D:\\MP\\電信\\input\\data.txt");FileOutputFormat.setOutputPath(job, new Path("D:\\MP\\電信\\output"));boolean b = job.waitForCompletion(true);System.exit(b ? 0 : 1);} }總結(jié)
以上是生活随笔為你收集整理的MapReduce----电信数据清洗的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: p3966单词
- 下一篇: 浅论三维标注技术的重要性