My data set (scores.dat):
sachin,1996,86
sachin,1996,75
sachin,1996,145
sachin,1996,98
sachin,1997,97
sachin,1997,65
sachin,1996,98
sachin,1996,54
sachin,1998,98
sachin,1998,53
sachin,1997,34
sachin,1997,54
sachin,1997,54
sachin,1997,23
nikhil,1996,56
nikhil,1996,54
nikhil,1997,43
nikhil,1998,89
nikhil,1996,32
nikhil,1997,54
nikhil,1998,45
nikhil,1996,32
nikhil,1996,43
akash,1996,122
akash,1996,98
akash,1997,12
akash,1998,23
akash,1996,87
akash,1997,65
akash,1998,65
akash,1996,32
akash,1996,73
mapper.py (/home/hduser/mapper.py):
#!/usr/bin/env python
"""Hadoop Streaming mapper for the player-score data set.

Reads comma-separated ``name,year,score`` records from standard input and
writes one ``name<TAB>score`` line per record to standard output. The year
column is dropped. Works under both Python 2 and Python 3.
"""
import sys


def map_records(lines):
    """Yield a ``name<TAB>score`` string for every well-formed input line.

    Args:
        lines: iterable of text lines, each expected to hold exactly three
            comma-separated fields (name, year, score).

    Yields:
        The name and score joined by a tab, without a trailing newline.

    Blank lines are skipped silently. Lines with a different number of
    fields are reported on stderr and skipped, so a single bad record does
    not abort the whole streaming task.
    """
    for line in lines:
        record = line.strip()
        if not record:
            continue
        fields = record.split(",")
        if len(fields) != 3:
            sys.stderr.write("mapper: skipping malformed record: %r\n" % line)
            continue
        name, _year, score = fields
        yield "%s\t%s" % (name, score)


def main(stdin=None, stdout=None):
    """Map every line of *stdin* to *stdout* (default: the process streams)."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for pair in map_records(stdin):
        stdout.write(pair + "\n")


# Guarded so the module can be imported (e.g. by tests) without reading stdin.
if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""Hadoop Streaming mapper for the player-score data set.

Reads comma-separated ``name,year,score`` records from standard input and
writes one ``name<TAB>score`` line per record to standard output. The year
column is dropped. Works under both Python 2 and Python 3.
"""
import sys


def map_records(lines):
    """Yield a ``name<TAB>score`` string for every well-formed input line.

    Args:
        lines: iterable of text lines, each expected to hold exactly three
            comma-separated fields (name, year, score).

    Yields:
        The name and score joined by a tab, without a trailing newline.

    Blank lines are skipped silently. Lines with a different number of
    fields are reported on stderr and skipped, so a single bad record does
    not abort the whole streaming task.
    """
    for line in lines:
        record = line.strip()
        if not record:
            continue
        fields = record.split(",")
        if len(fields) != 3:
            sys.stderr.write("mapper: skipping malformed record: %r\n" % line)
            continue
        name, _year, score = fields
        yield "%s\t%s" % (name, score)


def main(stdin=None, stdout=None):
    """Map every line of *stdin* to *stdout* (default: the process streams)."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for pair in map_records(stdin):
        stdout.write(pair + "\n")


# Guarded so the module can be imported (e.g. by tests) without reading stdin.
if __name__ == "__main__":
    main()
reducer.py (/home/hduser/reducer.py):
#!/usr/bin/env python
"""Hadoop Streaming reducer: report each player's highest score.

Reads ``name<TAB>score`` lines from standard input and writes one
``name<TAB>max_score`` line per run of consecutive lines sharing a name.
The input must therefore already be grouped by name (the script is meant
to be fed sorted mapper output, e.g. via ``sort -k1,1``). Works under both
Python 2 and Python 3.
"""
from itertools import groupby
from operator import itemgetter
import sys


def parse_scores(lines):
    """Yield ``(name, score)`` tuples parsed from tab-separated lines.

    Args:
        lines: iterable of text lines of the form ``name<TAB>score``.

    Yields:
        ``(name, score)`` with the score converted to ``int``.

    Blank lines are skipped silently. Lines without exactly two fields, or
    whose score is not an integer, are reported on stderr and skipped so
    that one bad record does not abort the whole streaming task.
    """
    for line in lines:
        record = line.strip()
        if not record:
            continue
        try:
            # Both a wrong field count and a non-numeric score raise ValueError.
            name, raw_score = record.split("\t")
            score = int(raw_score)
        except ValueError:
            sys.stderr.write("reducer: skipping malformed record: %r\n" % line)
            continue
        yield name, score


def max_scores(lines):
    """Yield ``(name, highest_score)`` for each consecutive run of a name.

    Args:
        lines: iterable of ``name<TAB>score`` text lines, grouped by name.

    Yields:
        One ``(name, max_score)`` tuple per run, in input order.
    """
    # groupby only merges adjacent equal keys, which matches the
    # one-key-at-a-time order in which a reducer receives its input.
    for name, run in groupby(parse_scores(lines), key=itemgetter(0)):
        yield name, max(map(itemgetter(1), run))


def main(stdin=None, stdout=None):
    """Reduce *stdin* to per-name maxima on *stdout* (default: process streams)."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for name, best in max_scores(stdin):
        stdout.write("%s\t%s\n" % (name, best))


# Guarded so the module can be imported (e.g. by tests) without reading stdin.
if __name__ == "__main__":
    main()
Test first in a Unix shell:
cat scores.dat | python mapper.py | sort -k1,1 | python reducer.py
akash 122
nikhil 89
sachin 145
MapReduce command for running the job:
${HADOOP_HOME}/bin/hadoop jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar -file /home/hduser/mapper.py -mapper /home/hduser/mapper.py -file /home/hduser/reducer.py -reducer /home/hduser/reducer.py -input /user/hduser/scores -output /user/hduser/scores-out
#!/usr/bin/env python
"""Hadoop Streaming reducer: report each player's highest score.

Reads ``name<TAB>score`` lines from standard input and writes one
``name<TAB>max_score`` line per run of consecutive lines sharing a name.
The input must therefore already be grouped by name (the script is meant
to be fed sorted mapper output, e.g. via ``sort -k1,1``). Works under both
Python 2 and Python 3.
"""
from itertools import groupby
from operator import itemgetter
import sys


def parse_scores(lines):
    """Yield ``(name, score)`` tuples parsed from tab-separated lines.

    Args:
        lines: iterable of text lines of the form ``name<TAB>score``.

    Yields:
        ``(name, score)`` with the score converted to ``int``.

    Blank lines are skipped silently. Lines without exactly two fields, or
    whose score is not an integer, are reported on stderr and skipped so
    that one bad record does not abort the whole streaming task.
    """
    for line in lines:
        record = line.strip()
        if not record:
            continue
        try:
            # Both a wrong field count and a non-numeric score raise ValueError.
            name, raw_score = record.split("\t")
            score = int(raw_score)
        except ValueError:
            sys.stderr.write("reducer: skipping malformed record: %r\n" % line)
            continue
        yield name, score


def max_scores(lines):
    """Yield ``(name, highest_score)`` for each consecutive run of a name.

    Args:
        lines: iterable of ``name<TAB>score`` text lines, grouped by name.

    Yields:
        One ``(name, max_score)`` tuple per run, in input order.
    """
    # groupby only merges adjacent equal keys, which matches the
    # one-key-at-a-time order in which a reducer receives its input.
    for name, run in groupby(parse_scores(lines), key=itemgetter(0)):
        yield name, max(map(itemgetter(1), run))


def main(stdin=None, stdout=None):
    """Reduce *stdin* to per-name maxima on *stdout* (default: process streams)."""
    stdin = sys.stdin if stdin is None else stdin
    stdout = sys.stdout if stdout is None else stdout
    for name, best in max_scores(stdin):
        stdout.write("%s\t%s\n" % (name, best))


# Guarded so the module can be imported (e.g. by tests) without reading stdin.
if __name__ == "__main__":
    main()
Test first in a Unix shell:
cat scores.dat | python mapper.py | sort -k1,1 | python reducer.py
akash 122
nikhil 89
sachin 145
MapReduce command for running the job:
${HADOOP_HOME}/bin/hadoop jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar -file /home/hduser/mapper.py -mapper /home/hduser/mapper.py -file /home/hduser/reducer.py -reducer /home/hduser/reducer.py -input /user/hduser/scores -output /user/hduser/scores-out
No comments:
Post a Comment