20-hadoop-pagerank的计算

转: http://www.cnblogs.com/rubinorth/p/5799848.html

　　参考尚学堂视频

1, 概念( 来自百度百科)

PageRank是Google专有的算法，用于衡量特定网页相对于搜索引擎索引中的其他网页而言的重要程度。它由Larry Page 和 Sergey Brin在20世纪90年代后期发明。PageRank实现了将链接价值概念作为排名因素。

PageRank让链接来”投票”一个页面的“得票数”由所有链向它的页面的重要性来决定，到一个页面的超链接相当于对该页投一票。一个页面的PageRank是由所有链向它的页面（“链入页面”）的重要性经过递归算法得到的。一个有较多链入的页面会有较高的等级，相反如果一个页面没有任何链入页面，那么它没有等级。 2, 原理

), 入链 == 投票让链接投票, 到一个页面的超链(入链)就是一次投票), 入链数量越多, pr值越高), 入链质量越高, pr值越高

收敛计算

), 初始值
     每个页面初始pr值相同
), 迭代递归(收敛)
    不断的重复计算, 最后pr值趋于稳定, 就是收敛
    a), 绝对收敛: 每个页面的pr值和上一次的pr值相同
    b), 相对收敛: 所有页面和上一次的计算pr平均差值小于收敛差值(0.0001)
    c), 百分比页面(%) 和上一次计算值相同

修正的pr计算公式

由于一些页面出链为0, 对其他网页没有贡献, 于是设定所有网页都有出链, 即他自己, 因为增加阻尼系数q, 一般为 0.85

详细推导请转大牛网页: http://www.cnblogs.com/rubinorth/p/5799848.html

3, hadoop实现

1), 定义node

package com.wenbronk.pagerank;import java.io.IOException;
import java.util.Arrays;import org.apache.commons.lang.StringUtils;/**
 * pageRank 工具代码
 *
 * @author root
 *
 */
public class Node {
    /** 初始值 */
    private Double pageRank = 1d;
    /** 出链 */
    private String[] adjacentNodeNames;
    public static final char fieldSeparator = '\t';    public Double getPageRank() {
        return pageRank;
    }    public Node setPageRank(Double pageRank) {
        this.pageRank = pageRank;
        return this;
    }    public String[] getAdjacentNodeNames() {
        return adjacentNodeNames;
    }    public Node setAdjacentNodeNames(String[] adjacentNodeNames) {
        this.adjacentNodeNames = adjacentNodeNames;
        return this;
    }    public boolean containsAdjacentNodes() {
        return adjacentNodeNames != null && adjacentNodeNames.length > ;
    }    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(pageRank);        if (getAdjacentNodeNames() != null) {
            sb.append(fieldSeparator).append(StringUtils.join(getAdjacentNodeNames(), fieldSeparator));
        }
        return sb.toString();
    }    // value =1.0 B D
    public static Node fromMR(String value) throws IOException {
        String[] parts = StringUtils.splitPreserveAllTokens(value, fieldSeparator);
        if (parts.length < ) {
            throw new IOException("Expected 1 or more parts but received " + parts.length);
        }
        Node node = new Node().setPageRank(Double.valueOf(parts[]));
        if (parts.length > ) {
            node.setAdjacentNodeNames(Arrays.copyOfRange(parts, , parts.length));
        }
        return node;
    }}

2, mapper

package com.wenbronk.pagerank;import java.io.IOException;import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;/**
 * map
 * @author wenbronk
 */
public class PageRankMapper extends Mapper<Text, Text, Text, Text> {    /**
     * 输入为 A: B D 第一次 A: 1 B D
     */
    @Override
    protected void map(Text key, Text value, Mapper<Text, Text, Text, Text>.Context context)
            throws IOException, InterruptedException {
        // 计算的次数, 通过context传入
        int runCount = context.getConfiguration().getInt("runCount", );
        String key_ = key.toString();
        Node node = null;        if (runCount == ) {
            node = Node.fromMR("1.0\t" + value.toString());
        } else {
            node = Node.fromMR(value.toString());
        }
        // 有出链
        if (node.containsAdjacentNodes()) {
            Double outValue = node.getPageRank() / node.getAdjacentNodeNames().length;            // 输出 A: 1.0    B    D, 输出原始值
            context.write(new Text(key_), new Text(node.toString()));
            for (int i = ; i < node.getAdjacentNodeNames().length; i++) {
                String outKey = node.getAdjacentNodeNames()[i];
                // 输出 B: 0.5; D: 0.5;
                context.write(new Text(outKey), new Text(outValue + ""));
            }
        }    }}

3, reducer

package com.wenbronk.pagerank;import java.io.IOException;import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;/**
 *
 * @author wenbronk
 */
public class PageRankReducer extends Reducer<Text, Text, Text, Text>{    /**
     * map 传入的数据
     * 实现pageRank的算法
     */
    @Override
    protected void reduce(Text arg0, Iterable<Text> arg1, Reducer<Text, Text, Text, Text>.Context arg2)
            throws IOException, InterruptedException {        Double sum = 0d;        // 原始值
        Node sourceNode = null;
        for (Text text : arg1) {
            Node node = Node.fromMR(text.toString());
            if (node.containsAdjacentNodes()) {
                sourceNode = node;
            }else {
                sum += node.getPageRank();
            }
        }
        // 0.85 为公式阻尼系数
        Double newPR = ( - 0.85) / 4.0 + (0.85 * sum);        // 原始值比较, 扩大倍数, 根据收敛值确定(0.001, 就是1000)
        Double d = Math.abs((newPR - sourceNode.getPageRank()) * 1000.0);
        System.err.println(d);
        // 计数器, 确定最后停止标准
        arg2.getCounter(Counter.Counter).increment(d.intValue());        sourceNode.setPageRank(newPR);
        arg2.write(arg0, new Text(sourceNode.toString()));
    }}

4, 需要用到hadoop的计数器, 定义一个枚举

package com.wenbronk.pagerank;public enum Counter {    Counter
}

5, 执行类

package com.wenbronk.pagerank;import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;/**
 *
 * @author root
 */
public class PageRankMain {    public static void main(String[] args) {        double d =0.001;        Configuration config = new Configuration();
        config.set("fs.default", "hdfs://192.168.208.106:8020");
        config.set("yarn.resourcemanager.hostname", "192.168.208.106");        int runCount = ;
        while (true) {
            runCount ++;
            try {
                config.setInt("runCount", runCount);
                Job job = Job.getInstance(config);
                job.setJobName("newPr" + runCount);
                job.setJarByClass(PageRankMain.class);
                job.setMapperClass(PageRankMapper.class);
                job.setReducerClass(PageRankReducer.class);
                job.setMapOutputKeyClass(Text.class);
                job.setMapOutputValueClass(Text.class);
                job.setInputFormatClass(KeyValueTextInputFormat.class);                Path inputPath = new Path("E:\\sxt\\1-MapReduce\\data\\pagerank.txt");
                if (runCount > ) {
                    inputPath = new Path("E:\\sxt\\1-MapReduce\\data\\pagerank" + (runCount - ));
                }
                FileInputFormat.addInputPath(job, inputPath);                Path outPath = new Path("E:\\sxt\\1-MapReduce\\data\\pagerank" + runCount);
                FileSystem fileSystem = FileSystem.get(config);
                if (fileSystem.exists(outPath)) {
                    fileSystem.delete(outPath);
                }
                FileOutputFormat.setOutputPath(job, outPath);
                boolean flag = job.waitForCompletion(true);                if (flag) {
                    // hadoop自己的计数器, 使用枚举作为标记
                    long value = job.getCounters().findCounter(Counter.Counter).getValue();
                    // 根据pagerank公式, 除以4, 缩小1000倍
                    double avg = value / (4.0 * );
                    if (avg < d) {
                        break;
                    }
                }
            }catch (Exception e) {
                e.printStackTrace();
            }
        }
    }}

最后, 还有初始文档

A    B    D
B    C
C    A    B
D    B    C

注: java的类型转换, 除以int或者乘int类型的结果都将是int类型, 导致reduce失败

个人收藏笔记记录

开通VIP