NAME

Data::Mining::Apriori - Perl extension that implements the Apriori algorithm for data mining.

SYNOPSIS

use strict;
use warnings;
use Data::Mining::Apriori;

# TRANSACTION 103:CEREAL 101:MILK 102:BREAD
#        1101          1        1         0
#        1102          1        0         1
#        1103          1        1         1
#        1104          1        1         1
#        1105          0        1         1
#        1106          1        1         1
#        1107          1        1         1
#        1108          1        0         1
#        1109          1        1         1
#        1110          1        1         1

my $apriori = new Data::Mining::Apriori;

$apriori->{metrics}{minSupport}=0.0155; # The minimum support, default value is 0.01(1%)

$apriori->{metrics}{minConfidence}=0.0155; # The minimum confidence, default value is 0.10(10%)

$apriori->{metrics}{minLift}=1; # The minimum lift(optional)

$apriori->{metrics}{minLeverage}=0; # The minimum leverage(optional)

$apriori->{metrics}{minConviction}=0; # The minimum conviction(optional)

$apriori->{metrics}{minCoverage}=0; # The minimum coverage(optional)

$apriori->{metrics}{minCorrelation}=0; # The minimum correlation(optional)

$apriori->{metrics}{minCosine}=0; # The minimum cosine(optional)

$apriori->{metrics}{minLaplace}=0; # The minimum laplace(optional)

$apriori->{metrics}{minJaccard}=0; # The minimum jaccard(optional)

$apriori->{output}=1;
# The output type (1 - Export to text file delimited by tab; 2 - Export to excel file with chart)(optional)

$apriori->{messages}=1; # A boolean value to display the messages (optional)

$apriori->{itemsKeyDescription}{'101'}='MILK'; # Hash table to add items by key and description
$apriori->{itemsKeyDescription}{102}='BREAD';
$apriori->{itemsKeyDescription}{'103'}='CEREAL';

my@items=(103,101);
$apriori->insert_key_items_transaction(\@items); # Insert key items per transaction
$apriori->insert_key_items_transaction([103,102]);
$apriori->insert_key_items_transaction([103,101,102]);
$apriori->insert_key_items_transaction([103,101,102]);
$apriori->insert_key_items_transaction([101,102]);
$apriori->insert_key_items_transaction([103,101,102]);
$apriori->insert_key_items_transaction([103,101,102]);
$apriori->insert_key_items_transaction([103,102]);
$apriori->insert_key_items_transaction([103,101,102]);
$apriori->insert_key_items_transaction([103,101,102]);

# or from a data file

$apriori->input_data_file("datafile.txt",",");
# Insert items per line (transaction); accepts the path to the data file and the item separator as arguments

# file contents (example)

103,101
103,102
103,101,102
103,101,102
101,102
103,101,102
103,101,102
103,102
103,101,102
103,101,102

print "\n${\$apriori->quantity_possible_rules}"; # Show the quantity of possible rules

$apriori->{limitRules}=10; # The limit of rules

$apriori->generate_rules;
# Generate association rules until no set of items meets the minimum support, confidence, lift, leverage, conviction, coverage, correlation, cosine, laplace, jaccard or limit of rules

print "\n@{$apriori->{frequentItemset}}\n"; # Show frequent items

#output messages

12
3 items, 12 possible rules
Large itemset size 2, 3 items
Processing...
Frequent itemset: { 103, 102, 101 }, 3 items
Exporting to excel "output_large_itemset_size_2.xlsx"...
Large itemset size 3, 3 items
Processing...
Frequent itemset: { 103, 101, 102 }, 3 items
Exporting to excel "output_large_itemset_size_3.xlsx"...
103, 101, 102

#output file "output_itemset_size_2.txt"

Rules	Support	Confidence	Lift	Leverage	Conviction	Coverage	Correlation	Cosine	Laplace	Jaccard
R1	0.7000	0.7778	1,1111	0,0700	1,3500	0,9000	0,5092	0,8819	0,5862	0,7778
R2	0.8000	0.8889	1,1111	0,0800	1,8000	0,9000	0,6667	0,9428	0,6207	0,8889
R3	0.7000	0.8750	1,2500	0,1400	2,4000	0,8000	0,7638	0,9354	0,6071	0,8750
R4	0.7000	0.8750	1,2500	0,1400	2,4000	0,8000	0,7638	0,9354	0,6071	0,8750
R5	0.8000	0.8889	1,1111	0,0800	1,8000	0,9000	0,6667	0,9428	0,6207	0,8889
R6	0.7000	0.7778	1,1111	0,0700	1,3500	0,9000	0,5092	0,8819	0,5862	0,7778

Rule R1: { 102 } => { 101 }
Support: 0.7000
Confidence: 0.7778
Lift: 1,1111
Leverage: 0,0700
Conviction: 1,3500
Coverage: 0,9000
Correlation: 0,5092
Cosine: 0,8819
Laplace: 0,5862
Jaccard: 0,7778
Items:
102 BREAD
101 MILK

to be continued...

#output file "output_itemset_size_3.txt"

Rules	Support	Confidence	Lift	Leverage	Conviction	Coverage	Correlation	Cosine	Laplace	Jaccard
R7	0.6000	0.7500	1,2500	0,1200	1,6000	0,8000	0,6124	0,8660	0,5714	0,7500
R8	0.6000	0.8571	1,4286	0,1800	2,8000	0,7000	0,8018	0,9258	0,5926	0,8571
R9	0.6000	0.8571	1,4286	0,1800	2,8000	0,7000	0,8018	0,9258	0,5926	0,8571
R10	0.6000	0.6667	1,1111	0,0600	1,2000	0,9000	0,4082	0,8165	0,5517	0,6667
R11	0.6000	0.7500	1,2500	0,1200	1,6000	0,8000	0,6124	0,8660	0,5714	0,7500
R12	0.6000	0.6667	1,1111	0,0600	1,2000	0,9000	0,4082	0,8165	0,5517	0,6667

Rule R7: { 101 } => { 102, 103 }
Support: 0.6000
Confidence: 0.7500
Lift: 1,2500
Leverage: 0,1200
Conviction: 1,6000
Coverage: 0,8000
Correlation: 0,6124
Cosine: 0,8660
Laplace: 0,5714
Jaccard: 0,7500
Items:
101 MILK
102 BREAD
103 CEREAL

Rule R8: { 101, 102 } => { 103 }
Support: 0.6000
Confidence: 0.8571
Lift: 1,4286
Leverage: 0,1800
Conviction: 2,8000
Coverage: 0,7000
Correlation: 0,8018
Cosine: 0,9258
Laplace: 0,5926
Jaccard: 0,8571
Items:
101 MILK
102 BREAD
103 CEREAL

to be continued...

# or from a database

# CREATE TABLE dimension_product(
	# product_key INTEGER NOT NULL PRIMARY KEY,
	# product_alternate_key INTEGER NOT NULL,
	# product_name TEXT NOT NULL,
	# price REAL NOT NULL
	# -- ...
# );

# INSERT INTO dimension_product VALUES(1,101,'MILK',10.00);
# INSERT INTO dimension_product VALUES(2,102,'BREAD',10.00);
# INSERT INTO dimension_product VALUES(3,103,'CEREAL',10.00);
# -- ...

# CREATE TABLE fact_sales(
	# sales_order_number INTEGER NOT NULL,
	# sales_order_line_number INTEGER NOT NULL,
	# product_key INTEGER NOT NULL,
	# quantity INTEGER NOT NULL,
	# -- ...
	# PRIMARY KEY(sales_order_number, sales_order_line_number),
	# FOREIGN KEY(product_key) REFERENCES dimension_product(product_key)
# );

# INSERT INTO fact_sales VALUES(1101,1,3,1);
# INSERT INTO fact_sales VALUES(1101,2,1,1);
# INSERT INTO fact_sales VALUES(1102,1,3,1);
# INSERT INTO fact_sales VALUES(1102,2,2,1);
# INSERT INTO fact_sales VALUES(1103,1,1,1);
# INSERT INTO fact_sales VALUES(1103,2,2,1);
# INSERT INTO fact_sales VALUES(1103,3,3,1);
# INSERT INTO fact_sales VALUES(1104,1,1,1);
# INSERT INTO fact_sales VALUES(1104,2,2,1);
# INSERT INTO fact_sales VALUES(1104,3,3,1);
# INSERT INTO fact_sales VALUES(1105,1,1,1);
# INSERT INTO fact_sales VALUES(1105,2,2,1);
# INSERT INTO fact_sales VALUES(1106,1,1,1);
# INSERT INTO fact_sales VALUES(1106,2,2,1);
# INSERT INTO fact_sales VALUES(1106,3,3,1);
# INSERT INTO fact_sales VALUES(1107,1,1,1);
# INSERT INTO fact_sales VALUES(1107,2,2,1);
# INSERT INTO fact_sales VALUES(1107,3,3,1);
# INSERT INTO fact_sales VALUES(1108,1,3,1);
# INSERT INTO fact_sales VALUES(1108,2,2,1);
# INSERT INTO fact_sales VALUES(1109,1,1,1);
# INSERT INTO fact_sales VALUES(1109,2,2,1);
# INSERT INTO fact_sales VALUES(1109,3,3,1);
# INSERT INTO fact_sales VALUES(1110,1,1,1);
# INSERT INTO fact_sales VALUES(1110,2,2,1);
# INSERT INTO fact_sales VALUES(1110,3,3,1);
# -- ...

use DBD::SQLite;
use Data::Mining::Apriori;

my $apriori = new Data::Mining::Apriori;

$apriori->{metrics}{minSupport}=0.0155;

$apriori->{metrics}{minConfidence}=0.0155;

$apriori->{metrics}{minLift}=1;

$apriori->{metrics}{minLeverage}=0;

$apriori->{metrics}{minConviction}=0;

$apriori->{metrics}{minCoverage}=0;

$apriori->{metrics}{minCorrelation}=0;

$apriori->{metrics}{minCosine}=0;

$apriori->{metrics}{minLaplace}=0;

$apriori->{metrics}{minJaccard}=0;

$apriori->{output}=1;

$apriori->{messages}=1;

my $db = DBI->connect('dbi:SQLite:dbname=DW.db','','');

my$sql = qq~
SELECT DISTINCT(fs.sales_order_number)
FROM dimension_product dp
JOIN fact_sales fs ON
dp.product_key = fs.product_key
-- WHERE ...
~;

my$query = $db->prepare($sql);
$query->execute;
my$transactions=$query->fetchall_arrayref;

foreach my$transaction(@$transactions){
	$sql = qq~
		SELECT dp.product_alternate_key, dp.product_name
		FROM dimension_product dp
		JOIN fact_sales fs ON
		dp.product_key = fs.product_key
		WHERE fs.sales_order_number = $$transaction[0];
		-- AND ...
	~;
	$query = $db->prepare($sql);
	$query->execute;
	my@items;
	while(my($key,$description)=$query->fetchrow){
		$apriori->{itemsKeyDescription}{$key}=$description;
		push@items,$key;
	}
	$apriori->insert_key_items_transaction(\@items);
}

print "\n${\$apriori->quantity_possible_rules}";

$apriori->{limitRules}=10;

$apriori->generate_rules;

print "\n@{$apriori->{frequentItemset}}\n";

DESCRIPTION

This module implements the apriori algorithm of data mining.

ATTRIBUTES

totalTransactions

The total number of transactions.

metrics

The types of metrics:

  • minSupport - the minimum support, default value is 0.01(1%);

  • minConfidence - the minimum confidence, default value is 0.10(10%);

  • minLift - the minimum lift(optional);

  • minLeverage - the minimum leverage(optional);

  • minConviction - the minimum conviction(optional);

  • minCoverage - the minimum coverage(optional);

  • minCorrelation - the minimum correlation(optional);

  • minCosine - the minimum cosine(optional);

  • minLaplace - the minimum laplace(optional);

  • minJaccard - the minimum jaccard(optional).

limitRules

The limit of rules(optional)

output

The output type (optional):

  • 1 - Text file delimited by tab;

  • 2 - Excel file with chart.

messages

A boolean value that controls whether the messages are displayed (optional).

itemsKeyDescription

Hash table to add items by key and description.

itemsKeyTransactions

Hash table to add the key items per transaction.

frequentItemset

Frequent itemset.

associationRules

A data structure to store the name of the rule, key items, implication, support, confidence, lift, leverage, conviction, coverage, correlation, cosine, laplace and jaccard.

$self->{associationRules} = {
                            '1' => {
                                   'implication' => '{ 102 } => { 101 }',
                                   'support' => '0.7000',
                                   'correlation' => '0,5092',
                                   'jaccard' => '0,7778',
                                   'confidence' => '0.7778',
                                   'conviction' => '1,3500',
                                   'leverage' => '0,0700',
                                   'laplace' => '0,5862',
                                   'cosine' => '0,8819',
                                   'coverage' => '0,9000',
                                   'lift' => '1,1111',
                                   'items' => [
                                              '102',
                                              '101'
                                              ]
                                    }
                            },
                            # to be continued...

METHODS

new

Creates a new instance of Data::Mining::Apriori.

insert_key_items_transaction(\@items)

Insert key items per transaction. Accepts the following arguments:

  • An array reference to key items.

input_data_file("datafile.txt",",")

Insert items per line(transaction). Accepts the following arguments:

  • Data file;

  • Item separator.

# file contents (example)

103,101
103,102
103,101,102
103,101,102
101,102
103,101,102
103,101,102
103,102
103,101,102
103,101,102

quantity_possible_rules

Returns the quantity of possible rules.

generate_rules

Generate association rules until no set of items meets the minimum support, confidence, lift, leverage, conviction, coverage, correlation, cosine, laplace, jaccard or limit of rules.

association_rules

Generate association rules by size of large itemsets.

AUTHOR

Alex Graciano, <agraciano@cpan.org>

COPYRIGHT AND LICENSE

Copyright (C) 2015-2016 by Alex Graciano

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.12.4 or, at your option, any later version of Perl 5 you may have available.