算法实现
(一)闷梁核凯友心类
Apriori算法的核心实现类为AprioriAlgorithm,实现的Java代码如下所示:
package org.shirdrn.datamining.association
import java.util.HashMap
import java.util.HashSet
import java.util.Iterator
import java.util.Map
import java.util.Set
import java.util.TreeMap
/**
* <B>关联规则挖掘:Apriori算法</B>
*
* <P>该算法基本上蚂孙运按照Apriori算法的基本思想来实现的。
*
* @author shirdrn
* @date 2009/07/22 22:56:23
* @msn shirdrn#hotmail.com(#→@)
* @qq 187071722
*/
public class AprioriAlgorithm {
private Map<Integer, Set<String>>txDatabase// 事务数据库
private Float minSup// 最小支持度
private Float minConf// 最小置信度
private Integer txDatabaseCount// 事务数据库中的事务数
private Map<Integer, Set<Set<String>>>freqItemSet// 频繁项集集合
private Map<Set<String>, Set<Set<String>>>assiciationRules// 频繁关联规则集合
public AprioriAlgorithm(
Map<Integer, Set<String>>txDatabase,
Float minSup,
Float minConf) {
this.txDatabase = txDatabase
this.minSup = minSup
this.minConf = minConf
this.txDatabaseCount = this.txDatabase.size()
freqItemSet = new TreeMap<Integer, Set<Set<String>>>()
assiciationRules = new HashMap<Set<String>, Set<Set<String>>>()
}
/**
* 扫描事务数据库,计算频繁1-项集
* @return
*/
public Map<Set<String>, Float>getFreq1ItemSet() {
Map<Set<String>, Float>freq1ItemSetMap = new HashMap<Set<String>, Float>()
Map<Set<String>, Integer>candFreq1ItemSet = this.getCandFreq1ItemSet()
Iterator<Map.Entry<Set<String>, Integer>>it = candFreq1ItemSet.entrySet().iterator()
while(it.hasNext()) {
Map.Entry<Set<String>, Integer>entry = it.next()
// 计算支持度
Float supported = new Float(entry.getValue().toString())/new Float(txDatabaseCount)
if(supported>=minSup) {
freq1ItemSetMap.put(entry.getKey(), supported)
}
}
return freq1ItemSetMap
}
/**
* 计算候选频繁1-项集
* @return
*/
public Map<Set<String>, Integer>getCandFreq1ItemSet() {
Map<Set<String>, Integer>candFreq1ItemSetMap = new HashMap<Set<String>, Integer>()
Iterator<Map.Entry<Integer, Set<String>>>it = txDatabase.entrySet().iterator()
// 统计支持数,生成候选频繁1-项集
while(it.hasNext()) {
Map.Entry<Integer, Set<String>>entry = it.next()
Set<String>itemSet = entry.getValue()
for(String item : itemSet) {
Set<String>key = new HashSet<String>()
key.add(item.trim())
if(!candFreq1ItemSetMap.containsKey(key)) {
Integer value = 1
candFreq1ItemSetMap.put(key, value)
}
else {
Integer value = 1+candFreq1ItemSetMap.get(key)
candFreq1ItemSetMap.put(key, value)
}
}
}
return candFreq1ItemSetMap
}
/**
* 根据频繁(k-1)-项集计算候选频繁k-项集
*
* @param m 其中m=k-1
* @param freqMItemSet 频繁(k-1)-项集
* @return
*/
public Set<Set<String>>aprioriGen(int m, Set<Set<String>>freqMItemSet) {
Set<Set<String>>candFreqKItemSet = new HashSet<Set<String>>()
Iterator<Set<String>>it = freqMItemSet.iterator()
Set<String>originalItemSet = null
while(it.hasNext()) {
originalItemSet = it.next()
Iterator<Set<String>>itr = this.getIterator(originalItemSet, freqMItemSet)
while(itr.hasNext()) {
Set<String>identicalSet = new HashSet<String>()// 两个项集相同元素的集合(集合的交运算)
identicalSet.addAll(originalItemSet)
Set<String>set = itr.next()
identicalSet.retainAll(set)// identicalSet中剩下的元素是identicalSet与set集合中公有的元素
if(identicalSet.size() == m-1) { // (k-1)-项集中k-2个相同
Set<String>differentSet = new HashSet<String>()// 两个项集不同元素的集合(集合的差运算)
differentSet.addAll(originalItemSet)
differentSet.removeAll(set)// 因为有k-2个相同,则differentSet中一定剩下一个元素,即differentSet大小为1
differentSet.addAll(set)// 构造候选k-项集的一个元素(set大小为k-1,differentSet大小为k)
candFreqKItemSet.add(differentSet)// 加入候选k-项集集合
}
}
}
return candFreqKItemSet
}
/**
* 根据一个频繁k-项集的元素(集合),获取到频繁k-项集的从该元素开始的迭代器实例
* @param itemSet
* @param freqKItemSet 频繁k-项集
* @return
*/
private Iterator<Set<String>>getIterator(Set<String>itemSet, Set<Set<String>>freqKItemSet) {
Iterator<Set<String>>it = freqKItemSet.iterator()
while(it.hasNext()) {
if(itemSet.equals(it.next())) {
break
}
}
return it
}
/**
* 根据频繁(k-1)-项集,调用aprioriGen方法,计算频繁k-项集
*
* @param k
* @param freqMItemSet 频繁(k-1)-项集
* @return
*/
public Map<Set<String>, Float>getFreqKItemSet(int k, Set<Set<String>>freqMItemSet) {
Map<Set<String>, Integer>candFreqKItemSetMap = new HashMap<Set<String>, Integer>()
// 调用aprioriGen方法,得到候选频繁k-项集
Set<Set<String>>candFreqKItemSet = this.aprioriGen(k-1, freqMItemSet)
// 扫描事务数据库
Iterator<Map.Entry<Integer, Set<String>>>it = txDatabase.entrySet().iterator()
// 统计支持数
while(it.hasNext()) {
Map.Entry<Integer, Set<String>>entry = it.next()
Iterator<Set<String>>kit = candFreqKItemSet.iterator()
while(kit.hasNext()) {
Set<String>kSet = kit.next()
Set<String>set = new HashSet<String>()
set.addAll(kSet)
set.removeAll(entry.getValue())// 候选频繁k-项集与事务数据库中元素做差元算
if(set.isEmpty()) { // 如果拷贝set为空,支持数加1
if(candFreqKItemSetMap.get(kSet) == null) {
Integer value = 1
candFreqKItemSetMap.put(kSet, value)
}
else {
Integer value = 1+candFreqKItemSetMap.get(kSet)
candFreqKItemSetMap.put(kSet, value)
}
}
}
}
// 计算支持度,生成频繁k-项集,并返回
return support(candFreqKItemSetMap)
}
/**
* 根据候选频繁k-项集,得到频繁k-项集
*
* @param candFreqKItemSetMap 候选k项集(包含支持计数)
*/
public Map<Set<String>, Float>support(Map<Set<String>, Integer>candFreqKItemSetMap) {
Map<Set<String>, Float>freqKItemSetMap = new HashMap<Set<String>, Float>()
Iterator<Map.Entry<Set<String>, Integer>>it = candFreqKItemSetMap.entrySet().iterator()
while(it.hasNext()) {
Map.Entry<Set<String>, Integer>entry = it.next()
// 计算支持度
Float supportRate = new Float(entry.getValue().toString())/new Float(txDatabaseCount)
if(supportRate<minSup) { // 如果不满足最小支持度,删除
it.remove()
}
else {
freqKItemSetMap.put(entry.getKey(), supportRate)
}
}
return freqKItemSetMap
}
/**
* 挖掘全部频繁项集
*/
public void mineFreqItemSet() {
// 计算频繁1-项集
Set<Set<String>>freqKItemSet = this.getFreq1ItemSet().keySet()
freqItemSet.put(1, freqKItemSet)
// 计算频繁k-项集(k>1)
int k = 2
while(true) {
Map<Set<String>, Float>freqKItemSetMap = this.getFreqKItemSet(k, freqKItemSet)
if(!freqKItemSetMap.isEmpty()) {
this.freqItemSet.put(k, freqKItemSetMap.keySet())
freqKItemSet = freqKItemSetMap.keySet()
}
else {
break
}
k++
}
}
/**
* <P>挖掘频繁关联规则
* <P>首先挖掘出全部的频繁项集,在此基础上挖掘频繁关联规则
*/
public void mineAssociationRules() {
freqItemSet.remove(1)// 删除频繁1-项集
Iterator<Map.Entry<Integer, Set<Set<String>>>>it = freqItemSet.entrySet().iterator()
while(it.hasNext()) {
Map.Entry<Integer, Set<Set<String>>>entry = it.next()
for(Set<String>itemSet : entry.getValue()) {
// 对每个频繁项集进行关联规则的挖掘
mine(itemSet)
}
}
}
/**
* 对从频繁项集集合freqItemSet中每迭代出一个频繁项集元素,执行一次关联规则的挖掘
* @param itemSet 频繁项集集合freqItemSet中的一个频繁项集元素
*/
public void mine(Set<String>itemSet) {
int n = itemSet.size()/2// 根据集合的对称性,只需要得到一半的真子集
for(int i=1i<=ni++) {
// 得到频繁项集元素itemSet的作为条件的真子集集合
Set<Set<String>>properSubset = ProperSubsetCombination.getProperSubset(i, itemSet)
// 对条件的真子集集合中的每个条件项集,获取到对应的结论项集,从而进一步挖掘频繁关联规则
for(Set<String>conditionSet : properSubset) {
Set<String>conclusionSet = new HashSet<String>()
conclusionSet.addAll(itemSet)
conclusionSet.removeAll(conditionSet)// 删除条件中存在的频繁项
confide(conditionSet, conclusionSet)// 调用计算置信度的方法,并且挖掘出频繁关联规则
}
}
}
/**
* 对得到的一个条件项集和对应的结论项集,计算该关联规则的支持计数,从而根据置信度判断是否是频繁关联规则
* @param conditionSet 条件频繁项集
* @param conclusionSet 结论频繁项集
*/
public void confide(Set<String>conditionSet, Set<String>conclusionSet) {
// 扫描事务数据库
Iterator<Map.Entry<Integer, Set<String>>>it = txDatabase.entrySet().iterator()
// 统计关联规则支持计数
int conditionToConclusionCnt = 0// 关联规则(条件项集推出结论项集)计数
int conclusionToConditionCnt = 0// 关联规则(结论项集推出条件项集)计数
int supCnt = 0// 关联规则支持计数
while(it.hasNext()) {
Map.Entry<Integer, Set<String>>entry = it.next()
Set<String>txSet = entry.getValue()
Set<String>set1 = new HashSet<String>()
Set<String>set2 = new HashSet<String>()
set1.addAll(conditionSet)
set1.removeAll(txSet)// 集合差运算:set-txSet
if(set1.isEmpty()) { // 如果set为空,说明事务数据库中包含条件频繁项conditionSet
// 计数
conditionToConclusionCnt++
}
set2.addAll(conclusionSet)
set2.removeAll(txSet)// 集合差运算:set-txSet
if(set2.isEmpty()) { // 如果set为空,说明事务数据库中包含结论频繁项conclusionSet
// 计数
conclusionToConditionCnt++
}
if(set1.isEmpty() &&set2.isEmpty()) {
supCnt++
}
}
// 计算置信度
Float conditionToConclusionConf = new Float(supCnt)/new Float(conditionToConclusionCnt)
if(conditionToConclusionConf>=minConf) {
if(assiciationRules.get(conditionSet) == null) { // 如果不存在以该条件频繁项集为条件的关联规则
Set<Set<String>>conclusionSetSet = new HashSet<Set<String>>()
conclusionSetSet.add(conclusionSet)
assiciationRules.put(conditionSet, conclusionSetSet)
}
else {
assiciationRules.get(conditionSet).add(conclusionSet)
}
}
Float conclusionToConditionConf = new Float(supCnt)/new Float(conclusionToConditionCnt)
if(conclusionToConditionConf>=minConf) {
if(assiciationRules.get(conclusionSet) == null) { // 如果不存在以该结论频繁项集为条件的关联规则
Set<Set<String>>conclusionSetSet = new HashSet<Set<String>>()
conclusionSetSet.add(conditionSet)
assiciationRules.put(conclusionSet, conclusionSetSet)
}
else {
assiciationRules.get(conclusionSet).add(conditionSet)
}
}
}
/**
* 经过挖掘得到的频繁项集Map
*
* @return 挖掘得到的频繁项集集合
*/
public Map<Integer, Set<Set<String>>>getFreqItemSet() {
return freqItemSet
}
/**
* 获取挖掘到的全部的频繁关联规则的集合
* @return 频繁关联规则集合
*/
public Map<Set<String>, Set<Set<String>>>getAssiciationRules() {
return assiciationRules
}
}
(二)辅助类
ProperSubsetCombination类是一个辅助类,在挖掘频繁关联规则的过程中,用于生成一个频繁项集元素的非空真子集,实现代码如下:
package org.shirdrn.datamining.association
import java.util.BitSet
import java.util.HashSet
import java.util.Set
/**
* <B>求频繁项集元素(集合)的非空真子集集合</B>
* <P>从一个集合(大小为n)中取出m(m属于2~n/2的闭区间)个元素的组合实现类,获取非空真子集的集合
*
* @author shirdrn
* @date 2009/07/22 22:56:23
* @msn shirdrn#hotmail.com(#→@)
* @qq 187071722
*/
public class ProperSubsetCombination {
private static String[] array
private static BitSet startBitSet// 比特集合起始状态
private static BitSet endBitSet// 比特集合终止状态,用来控制循环
private static Set<Set<String>>properSubset// 真子集集合
/**
* 计算得到一个集合的非空真子集集合
*
* @param n 真子集的大小
* @param itemSet 一个频繁项集元素
* @return 非空真子集集合
*/
public static Set<Set<String>>getProperSubset(int n, Set<String>itemSet) {
String[] array = new String[itemSet.size()]
ProperSubsetCombination.array = itemSet.toArray(array)
properSubset = new HashSet<Set<String>>()
startBitSet = new BitSet()
endBitSet = new BitSet()
// 初始化startBitSet,左侧占满1
for (int i=0i<ni++) {
startBitSet.set(i, true)
}
// 初始化endBit,右侧占满1
for (int i=array.length-1i>=array.length-ni--) {
endBitSet.set(i, true)
}
// 根据起始startBitSet,将一个组合加入到真子集集合中
get(startBitSet)
while(!startBitSet.equals(endBitSet)) {
int zeroCount = 0// 统计遇到10后,左边0的个数
int oneCount = 0// 统计遇到10后,左边1的个数
int pos = 0// 记录当前遇到10的索引位置
// 遍历startBitSet来确定10出现的位置
for (int i=0i<array.lengthi++) {
if (!startBitSet.get(i)) {
zeroCount++
}
if (startBitSet.get(i) &&!startBitSet.get(i+1)) {
pos = i
oneCount = i - zeroCount
// 将10变为01
startBitSet.set(i, false)
startBitSet.set(i+1, true)
break
}
}
// 将遇到10后,左侧的1全部移动到最左侧
int counter = Math.min(zeroCount, oneCount)
int startIndex = 0
int endIndex = 0
if(pos>1 &&counter>0) {
pos--
endIndex = pos
for (int i=0i<counteri++) {
startBitSet.set(startIndex, true)
startBitSet.set(endIndex, false)
startIndex = i+1
pos--
if(pos>0) {
endIndex = pos
}
}
}
get(startBitSet)
}
return properSubset
}
/**
* 根据一次移位 *** 作得到的startBitSet,得到一个真子集
* @param bitSet
*/
private static void get(BitSet bitSet) {
Set<String>set = new HashSet<String>()
for(int i=0i<array.lengthi++) {
if(bitSet.get(i)) {
set.add(array[i])
}
}
properSubset.add(set)
}
}
测试用例
对上述Apriori算法的实现进行了简单的测试,测试用例如下所示:
package org.shirdrn.datamining.association
import java.util.HashMap
import java.util.Map
import java.util.Set
import java.util.TreeSet
import org.shirdrn.datamining.association.AprioriAlgorithm
import junit.framework.TestCase
/**
* <B>Apriori算法测试类</B>
*
* @author shirdrn
* @date 2009/07/22 22:56:23
* @msn shirdrn#hotmail.com(#→@)
* @qq 187071722
*/
public class TestAprioriAlgorithm extends TestCase {
private AprioriAlgorithm apriori
private Map<Integer, Set<String>>txDatabase
private Float minSup = new Float("0.50")
private Float minConf = new Float("0.70")
@Override
protected void setUp() throws Exception {
create()// 构造事务数据库
apriori = new AprioriAlgorithm(txDatabase, minSup, minConf)
}
/**
* 构造模拟事务数据库txDatabase
*/
public void create() {
txDatabase = new HashMap<Integer, Set<String>>()
Set<String>set1 = new TreeSet<String>()
set1.add("A")
set1.add("B")
set1.add("C")
set1.add("E")
txDatabase.put(1, set1)
Set<String>set2 = new TreeSet<String>()
set2.add("A")
set2.add("B")
set2.add("C")
txDatabase.put(2, set2)
Set<String>set3 = new TreeSet<String>()
set3.add("C")
set3.add("D")
txDatabase.put(3, set3)
Set<String>set4 = new TreeSet<String>()
set4.add("A")
set4.add("B")
set4.add("E")
txDatabase.put(4, set4)
}
/**
* 测试挖掘频繁1-项集
*/
public void testFreq1ItemSet() {
System.out.println("挖掘频繁1-项集 : " + apriori.getFreq1ItemSet())
}
/**
* 测试aprioriGen方法,生成候选频繁项集
*/
public void testAprioriGen() {
System.out.println(
"候选频繁2-项集 : " +
this.apriori.aprioriGen(1, this.apriori.getFreq1ItemSet().keySet())
)
}
/**
* 测试挖掘频繁2-项集
*/
public void testGetFreq2ItemSet() {
System.out.println(
"挖掘频繁2-项集 :" +
this.apriori.getFreqKItemSet(2, this.apriori.getFreq1ItemSet().keySet())
)
}
/**
* 测试挖掘频繁3-项集
*/
public void testGetFreq3ItemSet() {
System.out.println(
"挖掘频繁3-项集 :" +
this.apriori.getFreqKItemSet(
3,
this.apriori.getFreqKItemSet(2, this.apriori.getFreq1ItemSet().keySet()).keySet()
)
)
}
/**
* 测试挖掘全部频繁项集
*/
public void testGetFreqItemSet() {
this.apriori.mineFreqItemSet()// 挖掘频繁项集
System.out.println("挖掘频繁项集 :" + this.apriori.getFreqItemSet())
}
/**
* 测试挖掘全部频繁关联规则
*/
public void testMineAssociationRules() {
this.apriori.mineFreqItemSet()// 挖掘频繁项集
this.apriori.mineAssociationRules()
System.out.println("挖掘频繁关联规则 :" + this.apriori.getAssiciationRules())
}
}
测试结果:
挖掘频繁1-项集 : {[E]=0.5, [A]=0.75, [B]=0.75, [C]=0.75}
候选频繁2-项集 : [[E, C], [A, B], [B, C], [A, C], [E, B], [E, A]]
挖掘频繁2-项集 :{[A, B]=0.75, [B, C]=0.5, [A, C]=0.5, [E, B]=0.5, [E, A]=0.5}
挖掘频繁3-项集 :{[E, A, B]=0.5, [A, B, C]=0.5}
挖掘频繁项集 :{1=[[E], [A], [B], [C]], 2=[[A, B], [B, C], [A, C], [E, B], [E, A]], 3=[[E, A, B], [A, B, C]]}
挖掘频繁关联规则 :{[E]=[[A], [B], [A, B]], [A]=[[B]], [B]=[[A]], [B, C]=[[A]], [A, C]=[[B]], [E, B]=[[A]], [E, A]=[[B]]}
从测试结果看到,使用Apriori算法挖掘得到的全部频繁项集为:
{1=[[E], [A], [B], [C]], 2=[[A, B], [B, C], [A, C], [E, B], [E, A]], 3=[[E, A, B], [A, B, C]]}
使用Apriori算法挖掘得到的全部频繁关联规则为:
{E}→{A}、{E}→{B}、{E}→{A,B}、{A}→{B}、{B}→{A}、{B,C}→{A}、{A,C}→{B}、{B,E}→{A}、{A,E}→{B}。
http://www.csc.liv.ac.uk/~frans/Notes/KDD/AssocRuleMine/apriori.html.h
====================================
/*----------------------------------------------------------------------
File: apriori.h
Contents: apriori algorithm for finding frequent item sets
(specialized version for FIMI 2003 workshop)
Author : Christian Borgelt
History : 15.08.2003 file created from normal apriori.c
16.08.2003 parameter for transaction filtering added
18.08.2003 dynamic filtering decision based on times added
21.08.2003 transaction sort changed to heapsort
20.09.2003 output file made optional
----------------------------------------------------------------------*/
/*
Modified by : Frédéric Flouvat
Modifications : store the positive and negative border into an
an input trie for ABS
process stastical informations on dataset to stop
the apriori classical iterations
Author : Frédéric Flouvat
----------------------------------------------------------------------*/
#ifndef APRIRORI_H
#define APRIRORI_H
#include <猜颤iostream>
using namespace std
#define MAXIMAL
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <帆兆迟time.h>
#include <assert.h>
#include "tract.h"
#include "istree.h"
#include "Application.h"
/*----------------------------------------------------------------------
Preprocessor Definitions
----------------------------------------------------------------------*/
#define PRGNAME "fim/apriori"
#define DESCRIPTION "frequent item sets miner for FIMI 2003"
#define VERSION "version 1.7 (2003.12.02) " \
"态李(c) 2003 Christian Borgelt"
/* --- error codes --- */
#define E_OPTION(-5)/* unknown option */
#define E_OPTARG(-6)/* missing option argument */
#define E_ARGCNT(-7)/* too few/many arguments */
#define E_SUPP (-8)/* invalid minimum support */
#define E_NOTAS (-9)/* no items or transactions */
#define E_UNKNOWN (-18)/* unknown error */
#ifndef QUIET /* if not quiet version */
#define MSG(x)x /* print messages */
#else /* if quiet version */
#define MSG(x) /* suppress messages */
#endif
#define SEC_SINCE(t) ((clock()-(t)) /(double)CLOCKS_PER_SEC)
#define RECCNT(s) (tfs_reccnt(is_tfscan(s)) \
+ ((tfs_delim(is_tfscan(s)) == TFS_REC) ? 0 : 1))
#define BUFFER(s) tfs_buf(is_tfscan(s))
/*----------------------------------------------------------------------
Constants
----------------------------------------------------------------------*/
#ifndef QUIET /* if not quiet version */
/* --- error messages --- */
static const char *errmsgs[] = {
/* E_NONE 0 */ "no error\n",
/* E_NOMEM-1 */ "not enough memory\n",
/* E_FOPEN-2 */ "cannot open file %s\n",
/* E_FREAD-3 */ "read error on file %s\n",
/* E_FWRITE -4 */ "write error on file %s\n",
/* E_OPTION -5 */ "unknown option -%c\n",
/* E_OPTARG -6 */ "missing option argument\n",
/* E_ARGCNT -7 */ "wrong number of arguments\n",
/* E_SUPP -8 */ "invalid minimal support %d\n",
/* E_NOTAS-9 */ "no items or transactions to work on\n",
/*-10 to -15 */ NULL, NULL, NULL, NULL, NULL, NULL,
/* E_ITEMEXP -16 */ "file %s, record %d: item expected\n",
/* E_DUPITEM -17 */ "file %s, record %d: duplicate item %s\n",
/* E_UNKNOWN -18 */ "unknown error\n"
}
#endif
/*----------------------------------------------------------------------
Global Variables
----------------------------------------------------------------------*/
#ifndef QUIET
static char*prgname /* program name for error messages */
#endif
static ITEMSET *itemset = NULL/* item set */
static TASET *taset = NULL/* transaction set */
static TATREE *tatree = NULL/* transaction tree */
static ISTREE *istree = NULL/* item set tree */
static FILE*in = NULL/* input file */
static FILE*out = NULL/* output file */
extern "C" TATREE * apriori( char*fn_in, char*fn_out, int supp, int &level,
Trie * bdPapriori, Trie * bdn, set<Element>* relist, double ratioNfC, double &eps, int ismax,
vector<unsigned int >* stat, int &maxBdP, bool &generatedFk, bool verbose )
#endif
.c
============================================
/*----------------------------------------------------------------------
File: apriori.c
Contents: apriori algorithm for finding frequent item sets
(specialized version for FIMI 2003 workshop)
Author : Christian Borgelt
History : 15.08.2003 file created from normal apriori.c
16.08.2003 parameter for transaction filtering added
18.08.2003 dynamic filtering decision based on times added
21.08.2003 transaction sort changed to heapsort
20.09.2003 output file made optional
----------------------------------------------------------------------*/
/*
Modified by : Frédéric Flouvat
Modifications : store the positive and negative border into an
an input trie for ABS
process stastical informations on dataset to stop
the apriori classical iterations
Author : Frédéric Flouvat
----------------------------------------------------------------------*/
#include "apriori.h"
/*----------------------------------------------------------------------
Main Functions
----------------------------------------------------------------------*/
static void error (int code, ...)
{ /* --- print an error message */
#ifndef QUIET /* if not quiet version */
va_listargs /* list of variable arguments */
const char *msg /* error message */
assert(prgname) /* check the program name */
if (code <E_UNKNOWN) code = E_UNKNOWN
if (code <0) { /* if to report an error, */
msg = errmsgs[-code] /* get the error message */
if (!msg) msg = errmsgs[-E_UNKNOWN]
fprintf(stderr, "\n%s: ", prgname)
va_start(args, code) /* get variable arguments */
vfprintf(stderr, msg, args)/* print error message */
va_end(args) /* end argument evaluation */
}
#endif
#ifndef NDEBUG/* if debug version */
if (istree) ist_delete(istree)
if (tatree) tat_delete(tatree)
if (taset) tas_delete(taset, 0)
if (itemset) is_delete(itemset)
if (in) fclose(in) /* clean up memory */
if (out) fclose(out)/* and close files */
#endif
exit(code) /* abort the program */
} /* error() */
/*--------------------------------------------------------------------*/
TATREE * apriori( char*fn_in, char*fn_out, int supp, int &level, Trie * bdPapriori,
Trie * bdn , set<Element>* relist , double ratioNfC, double &eps,int ismax,
vector<unsigned int >* stat, int &maxBdP, bool &generatedFk, bool verbose )
{
int i, k, n /* loop variables, counters */
int tacnt = 0 /* number of transactions */
int max = 0 /* maximum transaction size */
int empty = 1 /* number of empty item sets */
int *map, *set /* identifier map, item set */
char*usage /* flag vector for item usage */
clock_t t, tt, tc, x/* timer for measurements */
double actNfC = 1
double avgNfC = 0
int nbgen = 0
int nbfreq = 0
level = 1
bool endApriori = false // boolean to stop the initial classial apriori approach
int bdnsize = 0// number of itemsets found infrequent
/* --- create item set and transaction set --- */
itemset = is_create() /* create an item set and */
if (!itemset) error(E_NOMEM)/* set the special characters */
taset = tas_create(itemset) /* create a transaction set */
if (!taset) error(E_NOMEM) /* to store the transactions */
if( verbose ) MSG(fprintf(stderr, "\n")) /* terminate the startup message */
/* --- read transactions --- */
if( verbose )MSG(fprintf(stderr, "reading %s ... ", fn_in))
t = clock()/* start the timer and */
in = fopen(fn_in, "r") /* open the input file */
if (!in) error(E_FOPEN, fn_in)
for (tacnt = 01tacnt++) { /* transaction read loop */
k = is_read(itemset, in) /* read the next transaction */
if (k <0) error(k, fn_in, RECCNT(itemset), BUFFER(itemset))
if (k >0) break /* check for error and end of file */
k = is_tsize(itemset) /* update the maximal */
if (k >max) max = k /* transaction size */
if (taset &&(tas_add(taset, NULL, 0) != 0))
error(E_NOMEM) /* add the loaded transaction */
} /* to the transaction set */
fclose(in)in = NULL /* close the input file */
n = is_cnt(itemset)/* get the number of items */
if( verbose ) MSG(fprintf(stderr, "[%d item(s),", n))
if( verbose ) MSG(fprintf(stderr, " %d transaction(s)] done ", tacnt))
if( verbose ) MSG(fprintf(stderr, "[%.2fs].\n", SEC_SINCE(t)))
/* --- sort and recode items --- */
if( verbose ) MSG(fprintf(stderr, "sorting and recoding items ... "))
t = clock() /* start the timer */
map = (int*)malloc(is_cnt(itemset) *sizeof(int))
if (!map) error(E_NOMEM)/* create an item identifier map */
n = is_recode(itemset, supp, 2, map) /* 2: sorting mode */
tas_recode(taset, map, n) /* recode the loaded transactions */
max = tas_max(taset)/* get the new maximal t.a. size */
// use in the other part of the implementation to have the corresponding
// identifiant to an internal id
stat->reserve( n+2 )
stat->push_back( 0 )
for(int j= 0j<n j++ )
{
stat->push_back( 0 )
relist->insert( Element( atoi( is_name( itemset, j ) ) ,j) )
}
if( verbose ) MSG(fprintf(stderr, "[%d item(s)] ", n))
if( verbose ) MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)))
/* --- create a transaction tree --- */
if( verbose ) MSG(fprintf(stderr, "creating transaction tree ... "))
t = clock() /* start the timer */
tatree = tat_create(taset,1)/* create a transaction tree */
if (!tatree) error(E_NOMEM) /* (compactify transactions) */
tt = clock() -t /* note the construction time */
if( verbose ) MSG(fprintf(stderr, "done [%.2fs].\n", SEC_SINCE(t)))
/* --- create an item set tree --- */
if( verbose ) MSG(fprintf(stderr, "checking subsets of size 1"))
t = clock()tc = 0 /* start the timer and */
istree = ist_create(n, supp)/* create an item set tree */
if (!istree) error(E_NOMEM)
for (k = n--k >= 0) /* set single item frequencies */
ist_setcnt(istree, k, is_getfrq(itemset, k))
ist_settac(istree, tacnt) /* set the number of transactions */
usage = (char*)malloc(n *sizeof(char))
if (!usage) error(E_NOMEM) /* create a item usage vector */
/* --- check item subsets --- */
while (ist_height(istree) <max &&( ( ismax == -1 && endApriori == false )
|| ist_height(istree) <ismax )
)
{
nbgen = 0
nbfreq = 0
level ++
i = ist_check(istree,usage)/* check current item usage */
if (i <max) max = i /* update the maximum set size */
if (ist_height(istree) >= i) break
k = ist_addlvl(istree, nbgen)/* while max. height is not reached, */
if (k < 0) error(E_NOMEM)/* add a level to the item set tree */
if (k != 0) break /* if no level was added, abort */
if( verbose ) MSG(fprintf(stderr, " %d", ist_height(istree)))
if ((i <n) /* check item usage on current level */
&& (i *(double)tt <0.1 *n *tc)) {
n = ix = clock() /* if items were removed and */
tas_filter(taset, usage)/* the counting time is long enough, */
tat_delete(tatree) /* remove unnecessary items */
tatree = tat_create(taset, 1)
if (!tatree) error(E_NOMEM)
tt = clock() -x /* rebuild the transaction tree and */
} /* note the new construction time */
x = clock() /* start the timer */
ist_countx(istree, tatree, nbfreq, istree->supp )/* count the transaction tree */
tc = clock() -x /* in the item set tree */
actNfC = 1-double(nbfreq)/double(nbgen)
avgNfC = avgNfC + actNfC
if( verbose )
{
cout<<" \t Fk : "<<nbfreq<<" Ck : "<<nbgen<<" NFk/Ck "<<actNfC<<" avg NFk/Ck "<<avgNfC/(level-1)<<endl
}
bdnsize += nbgen - nbfreq
if( level >=4 &&( bdnsize / nbgen <1.5 ) &&( bdnsize >100 ) )
{
if( actNfC <ratioNfC )
{
eps = 0
endApriori = true
}
else if( actNfC >0.25 )
endApriori = true
}
} /* and note the new counting time */
if( verbose ) MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)))
/* --- filter item sets --- */
t = clock() /* start the timer */
#ifdef MAXIMAL/* filter maximal item sets */
if( verbose ) MSG(fprintf(stderr, "filtering maximal item sets ... "))
if( ratioNfC == 0 || nbgen <k+1 || ist_height(istree)>= max )
ist_filter2(istree, IST_MAXFRQ, 0)
else
ist_filter2(istree, IST_MAXFRQ, bdn)
if( verbose ) MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)))
empty = (n <= 0) ? 1 : 0/* check whether the empty item set */
#endif/* is maximal */
#ifdef CLOSED /* filter closed item sets */
if( verbose ) MSG(fprintf(stderr, "filtering closed item sets ... "))
ist_filter(istree, IST_CLOSED)
if( verbose ) MSG(fprintf(stderr, " done [%.2fs].\n", SEC_SINCE(t)))
for (k = n--k >= 0) /* check for an item in all t.a. */
if (is_getfrq(itemset, k) == tacnt) break
empty = (k <= 0) ? 1 : 0/* check whether the empty item set */
#endif/* is closed */
/* --- print item sets --- */
for (i = ist_height(istree)--i >= 0)
map[i] = 0/* clear the item set counters */
if( verbose ) MSG(fprintf(stderr, "writing %s ... ", (fn_out) ? fn_out : "<none>"))
t = clock() /* start the timer and */
if (fn_out) { /* if an output file is given, */
out = fopen(fn_out, "w") /* open the output file */
if (!out) error(E_FOPEN, fn_out)
if (empty) fprintf(out, " (%d)\n", tacnt)
} /* report empty item set */
ist_init(istree)/* init. the item set extraction */
set = is_tract(itemset) /* get the transaction buffer */
for (n = empty1n++) { /* extract item sets from the tree */
k = ist_set(istree, set, &supp)
if (k <= 0) break /* get the next frequent item set */
map[k-1]++/* count the item set */
if (fn_out) { /* if an output file is given */
for (i = 0i <ki++) { /* traverse the items */
fputs(is_name(itemset, set[i]), out)
fputc(' ', out) /* print the name of the next item */
} /* followed by a separator */
fprintf(out, "(%d)\n", supp)
} /* print the item set's support */
else
{
short unsigned * is = new short unsigned[k]
for (i = 0i <ki++) /* traverse the items */
{
is[i] = set[i]
}
if( k <level || nbgen <k+1 || ist_height(istree)>= max )
{
bdPapriori->insert(is, k ,supp )
(*stat)[ 0 ] ++
(*stat)[ k+1 ]++
if( maxBdP <k )
maxBdP = k
}
else
{
generatedFk = true
}
delete[] is
}
}
if (fn_out) { /* if an output file is given */
if (fflush(out) != 0) error(E_FWRITE, fn_out)
if (out != stdout) fclose(out)
out = NULL/* close the output file */
}
if( verbose ) MSG(fprintf(stderr, "[%d set(s)] done ", n))
if( verbose ) MSG(fprintf(stderr, "[%.2fs].\n", SEC_SINCE(t)))
/* --- print item set statistics --- */
k = ist_height(istree) /* find last nonzero counter */
if ((k >0) &&(map[k-1] <= 0)) k--
if( verbose ){
printf("%d\n", empty) /* print the numbers of item sets */
for (i = 0i <ki++) printf("%d\n", map[i])
}
/* --- clean up --- */
#ifndef NDEBUG/* if this is a debug version */
free(usage) /* delete the item usage vector */
free(map) /* and the identifier map */
ist_delete(istree) /* delete the item set tree, */
if (taset) tas_delete(taset, 0) /* the transaction set, */
is_delete(itemset)/* and the item set */
#endif
return tatree
}
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)