mapreduce中的wordcount将结果降序输出_软件运维

结果的排序发生在 sort 阶段，并且是根据 key 来进行的。Hadoop 默认采用基本的排序策略（按 key 升序）；如果想实现自己的排序规则（例如降序输出），就需要自定义 key 类型并重写它的比较方法——这是大体的思路。

下面是一个自定义的 key2 类型：

public static class MyText implements WritableComparable {

private String key = ""

private int value = 0

public MyText() {

}

public MyText(String key, int value) {

this.key = key

this.value = value

}

@Override

public void write(DataOutput out) throws IOException {

out.writeUTF(key)

out.writeInt(value)

}

@Override

public void readFields(DataInput in) throws IOException {

key = in.readUTF()

value = in.readInt()

}

@Override

public int compareTo(Object o) {

MyText other = (MyText) o

return -this.key.compareTo(other.key)

}

public int CompareToValue(Object o) {

MyText other = (MyText) o

return this.value - other.value

}

public String getKey() {

return key

}

public void setKey(String key) {

this.key = key

}

public int getValue() {

return value

}

public void setValue(int value) {

this.value = value

}

这样就可以根据 compareTo 方法实现倒序排列，根据 CompareToValue 方法判断值的大小；你在 reduce 端再进行相应的改写，在 cleanup 的时候写出结果就可以了~

Hadoop1小组回答，仅供参考......

刚用 C 语言写的，功能已经实现，但还有很多可以优化的地方，可以自行修改，有不明白的地方请追问：

#include <stdio.h>

#include <string.h>

/* Maximum number of distinct words that can be counted; adjust as needed. */
#define MAX_WORD_COUNT 500

/* One table entry: a word and the number of times it has been seen. */
typedef struct WordCount
{
    char cWord[20]; /* NUL-terminated word, at most 19 characters */
    int iCount;     /* number of occurrences of cWord */
} T_WordCount;

int CalcEachWord(const char *pText)//计算单词个数及输出信息等

void LowerText(char *pText)//把单词变成小写形式

void SwapItem(T_WordCount *ItemA, T_WordCount * ItemB)//交换两个元素

void SortWord(T_WordCount *pWordSet)//排序

/*
 * Program entry point: prints a fixed sample text, then passes it to
 * CalcEachWord to count the words and print the result.
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
    /* Sample text: mixed case and embedded hyphens exercise the normalisation. */
    char pText[] ="Text HAs HAS ONE h-as MOrE Has MORE ha-S BLANk more blank or more oR blank Between wor-ds.";

    /* Command-line arguments are not used. */
    (void)argc;
    (void)argv;

    printf("The text is :\n");
    printf("----------------------------------\n");
    printf("%s\n", pText);
    printf("----------------------------------\n");
    printf("The top 5 words is :\n");

    CalcEachWord(pText);
    return 0;
}

int CalcEachWord(const char *pText)

{

char cTmp[20] = {0}

int i = 0

char *pTmp = cTmp

int iFlag = 0

T_WordCount tWordSet[MAX_WORD_COUNT]

memset(tWordSet, 0, sizeof(tWordSet))

while (*pText != '\0')

{

if ((*pText >= 'A' && *pText <= 'Z') || (*pText >= 'a' && *pText <= 'z'))

{

*pTmp = *pText

pTmp++

}

else if (*pText == '-')

{

++pText

continue

}

else

{

if (strlen(cTmp) > 0)

{

LowerText(cTmp)

iFlag = 0

for (i = 0 i < MAX_WORD_COUNT ++i)

{

if (strlen(tWordSet[i].cWord) > 0)

{

if (strcmp(tWordSet[i].cWord, cTmp) == 0)

{

iFlag = 1

tWordSet[i].iCount++

break

}

else

{

strcpy(tWordSet[i].cWord, cTmp)

tWordSet[i].iCount = 1

iFlag = 1

break

}

if (!iFlag)

{

printf("No more space to save word.\n")

}

memset(cTmp, 0, 20)

pTmp = cTmp

}

++pText

}

//排序 SortWord(tWordSet)

for (i = 0 i < 5 ++i)

{

if (strlen(tWordSet[i].cWord) > 0)

{

printf("%s:%d\n",tWordSet[i].cWord,tWordSet[i].iCount)

}

return 0

}

/*
 * Converts every ASCII upper-case letter in pText to lower case, in place.
 * All other characters are left unchanged.
 *
 * pText: NUL-terminated, writable string; a null pointer is ignored.
 */
void LowerText(char *pText)
{
    char *pTmp = pText;

    if (!pTmp)
    {
        return;
    }

    for (; *pTmp != '\0'; ++pTmp)
    {
        if (*pTmp >= 'A' && *pTmp <= 'Z')
        {
            /* 'a' - 'A' is the distance between the two cases. */
            *pTmp = (char)(*pTmp + ('a' - 'A'));
        }
    }
}

/*
 * Exchanges the contents (word and count) of two table entries.
 *
 * Whole-struct assignment is used instead of field-by-field strcpy, so the
 * call is also well defined when ItemA and ItemB point to the same entry.
 *
 * ItemA, ItemB: entries to exchange; both must be valid pointers.
 */
void SwapItem(T_WordCount *ItemA, T_WordCount * ItemB)
{
    T_WordCount Tmp = *ItemA;

    *ItemA = *ItemB;
    *ItemB = Tmp;
}

/*
 * Bubble sort: orders the table by iCount, largest first.
 *
 * Entries are exchanged only when the left count is strictly smaller than
 * the right one, so entries with equal counts keep their relative order.
 *
 * pWordSet: array of exactly MAX_WORD_COUNT entries, sorted in place.
 */
void SortWord(T_WordCount *pWordSet){
    int i, j;

    for (j = 0; j < MAX_WORD_COUNT - 1; j++)
    {
        int iSwapped = 0;

        for (i = 0; i < MAX_WORD_COUNT - 1 - j; i++)
        {
            if (pWordSet[i].iCount < pWordSet[i+1].iCount)
            {
                SwapItem(&pWordSet[i], &pWordSet[i+1]);
                iSwapped = 1;
            }
        }

        /* A pass without any exchange means the table is already sorted. */
        if (!iSwapped)
        {
            break;
        }
    }
}

测试结果截图：

欢迎分享，转载请注明来源：内存溢出

原文地址: https://outofmemory.cn/yw/12022060.html

mapreduce中的wordcount将结果降序输出

发表评论

评论列表（0条）