#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "ppport.h"
#include "const-c.inc"
#include <math.h>
//===================================================================
//
// Constants and Type Declarations
//
//===================================================================
// #define MAX_FEATURES 64 // Number of features to use
#define MAX_FEATURES 8
// #define MIN_EPOCHS 120 // Minimum number of epochs per feature
#define MIN_EPOCHS 500
// #define MAX_EPOCHS 200 // Max epochs per feature
#define MAX_EPOCHS 2000
#define MIN_IMPROVEMENT 0.005 // Minimum improvement required to continue current feature
#define INITx 0.1 // Initialization value for features
#define LRATE 0.001 // Learning rate parameter
#define K 0.015 // Regularization parameter used to minimize over-fitting
//////////////////////////////////////////////////////////////////////////////////////////////////////
typedef unsigned char BYTE;
#define true 1
#define false 0
struct Movie
{
int RatingCount;
int RatingSum;
double RatingAvg;
double PseudoAvg;
};
// PseudoAvg is a weighted average used to deal with small movie counts
struct Customer
{
int RatingCount;
int RatingSum;
};
// int CustomerId;
struct Data
{
int CustId;
short MovieId;
BYTE Rating;
float Cache;
};
// class Engine
int num_customers;
int num_movies;
int num_ratings;
// private:
int RatingCount; // Current number of loaded ratings
struct Data *Ratings; // Array of ratings data
struct Movie *Movies; // Array of movie metrics
struct Customer *Customers; // Array of customer metrics
float **MovieFeatures; // Array of features by movie (using floats to save space)
float **CustFeatures; // Array of features by customer (using floats to save space)
// was private, should be public now
void Engine(int, int, int);
void DestroyEngine();
void set_Movies(int, int, int);
void set_Ratings(int, int, int);
void CalcMetrics();
void CalcFeatures();
inline double PredictRating(short, int);
inline double PredictRating2(short, int, int, float, bool);
// end class Engine
// sdw accessors for Perl
void set_Movies(int movieId, int count, int sum) {
Movies[movieId].RatingCount = count;
Movies[movieId].RatingSum = sum;
}
void set_Ratings(int movieId, int custId, int rating) {
Ratings[RatingCount].MovieId = (short)movieId;
Ratings[RatingCount].CustId = custId;
Ratings[RatingCount].Rating = (BYTE)rating;
Ratings[RatingCount].Cache = 0;
RatingCount++;
}
//===================================================================
//
// Engine Class
//
//===================================================================
//-------------------------------------------------------------------
// Initialization
//-------------------------------------------------------------------
void Engine(int x_num_customers, int x_num_ratings, int x_num_movies) {
int i, f;
num_customers = x_num_customers; // yeah, I know... there's a trick for doing this. don't know it. XXX todo.
num_ratings = x_num_ratings;
num_movies = x_num_movies;
RatingCount = 0;
Ratings = (struct Data*)calloc(sizeof(struct Data), num_ratings);
Movies = (struct Movie*)calloc(sizeof(struct Movie), num_movies);
Customers = (struct Customer*)calloc(sizeof(struct Customer), num_customers);
MovieFeatures = (float **)calloc(sizeof(float *), MAX_FEATURES); // calloc should have these 0 initialized
CustFeatures = (float **)calloc(sizeof(float *), MAX_FEATURES);
for (f=0; f<MAX_FEATURES; f++)
{
MovieFeatures[f] = (float *)calloc(sizeof(float), num_movies);
CustFeatures[f] = (float *)calloc(sizeof(float), num_customers);
for (i=0; i<num_movies; i++) {
MovieFeatures[f][i] = (float)INITx;
}
for (i=0; i<num_customers; i++) {
CustFeatures[f][i] = (float)INITx;
}
}
}
void DestroyEngine() {
free(Ratings);
free(Movies);
free(Customers);
free(MovieFeatures);
free(CustFeatures);
RatingCount = 0;
}
//-------------------------------------------------------------------
// Calculations - This Paragraph contains all of the relevant code
//-------------------------------------------------------------------
//
// CalcMetrics
// - Loop through the history and pre-calculate metrics used in the training
//
void CalcMetrics()
{
int i, cid;
// IdItr itr;
wprintf(L"\nCalculating intermediate metrics\n");
// Process each row in the training set
for (i=0; i<RatingCount; i++)
{
struct Data* rating = Ratings + i;
// Increment movie stats
Movies[rating->MovieId].RatingCount++;
Movies[rating->MovieId].RatingSum += rating->Rating;
// killed map translate stuff here -- sdw
Customers[rating->CustId].RatingCount++;
Customers[rating->CustId].RatingSum += rating->Rating;
}
// Do a follow-up loop to calc movie averages
for (i=0; i<num_movies; i++)
{
struct Movie* movie = Movies+i;
movie->RatingAvg = movie->RatingSum / (1.0 * movie->RatingCount);
movie->PseudoAvg = (3.23 * 25 + movie->RatingSum) / (25.0 + movie->RatingCount);
}
}
//
// CalcFeatures
// - Iteratively train each feature on the entire data set
// - Once sufficient progress has been made, move on
//
void CalcFeatures()
{
int f, e, i, custId, cnt = 0;
struct Data* rating;
double err, p, sq, rmse_last, rmse = 2.0;
short movieId;
float cf, mf;
for (f=0; f<MAX_FEATURES; f++)
{
wprintf(L"\n--- Calculating feature: %d ---\n", f);
// Keep looping until you have passed a minimum number
// of epochs or have stopped making significant progress
for (e=0; (e < MIN_EPOCHS) || (rmse <= rmse_last - MIN_IMPROVEMENT); e++)
{
cnt++;
sq = 0;
rmse_last = rmse;
for (i=0; i<RatingCount; i++)
{
rating = Ratings + i;
movieId = rating->MovieId;
custId = rating->CustId;
// Predict rating and calc error
p = PredictRating2(movieId, custId, f, rating->Cache, true);
// wprintf(L" movieId=%d custId=%d rating=%d predicted=%1.2f finalpredicted=%1.2f\n", movieId, custId, rating->Rating, p, PredictRating(movieId, custId));
err = (1.0 * rating->Rating - p);
sq += err*err;
// Cache off old feature values
cf = CustFeatures[f][custId];
mf = MovieFeatures[f][movieId];
// Cross-train the features
CustFeatures[f][custId] += (float)(LRATE * (err * mf - K * cf));
MovieFeatures[f][movieId] += (float)(LRATE * (err * cf - K * mf));
}
rmse = sqrt(sq/RatingCount);
// wprintf(L" cnt='%d' rmse='%f' />\n",cnt,rmse);
}
// Cache off old predictions
for (i=0; i<RatingCount; i++)
{
rating = Ratings + i;
rating->Cache = (float)PredictRating2(rating->MovieId, rating->CustId, f, rating->Cache, false);
}
}
}
//
// PredictRating
// - During training there is no need to loop through all of the features
// - Use a cache for the leading features and do a quick calculation for the trailing
// - The trailing can be optionally removed when calculating a new cache value
//
double PredictRating2(short movieId, int custId, int feature, float cache, bool bTrailing)
{
// Get cached value for old features or default to an average
double sum = (cache > 0) ? cache : 1; //Movies[movieId].PseudoAvg;
// Add contribution of current feature
sum += MovieFeatures[feature][movieId] * CustFeatures[feature][custId];
if (sum > 5) sum = 5;
if (sum < 1) sum = 1;
// Add up trailing defaults values
if (bTrailing)
// if (bTrailing && feature > 0) // sdw -- no way to test this without more complicated data so leaving it alone for now
{
sum += (MAX_FEATURES-feature-1) * (INITx * INITx); // sdw -- this is bizarre; wrap around from the other side minus one?
// sum += (feature-1) * (INITx * INITx); // sdw -- wouldn't this be more correct? meh?
if (sum > 5) sum = 5;
if (sum < 1) sum = 1;
}
return sum;
}
//
// PredictRating
// - This version is used for calculating the final results
// - It loops through the entire list of finished features
//
double PredictRating(short movieId, int custId)
{
double sum = 1; //Movies[movieId].PseudoAvg;
int f;
for (f=0; f<MAX_FEATURES; f++)
{
sum += MovieFeatures[f][movieId] * CustFeatures[f][custId];
}
if (sum > 5) sum = 5;
if (sum < 1) sum = 1;
return sum;
}
/*
###############################################################################
#
# SVD Sample Code
#
# Copyright (C) 2007 Timely Development (www.timelydevelopment.com)
#
# Special thanks to Simon Funk and others from the Netflix Prize contest
# for providing pseudo-code and tuning hints.
#
# Feel free to use this code as you wish as long as you include
# these notices and attribution.
#
# Also, if you have alternative types of algorithms for accomplishing
# the same goal and would like to contribute, please share them as well :)
#
# STANDARD DISCLAIMER:
#
# - THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY
# - OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT
# - LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND#OR
# - FITNESS FOR A PARTICULAR PURPOSE.
#
###############################################################################
Copyright © 2008 by Timely Development, LLC.
*/
MODULE = Math::Preference::SVD PACKAGE = Math::Preference::SVD
INCLUDE: const-xs.inc
PROTOTYPES: DISABLE
void
set_Movies (movieId, count, sum)
int movieId
int count
int sum
PREINIT:
I32* temp;
PPCODE:
temp = PL_markstack_ptr++;
set_Movies(movieId, count, sum);
if (PL_markstack_ptr != temp) {
/* truly void, because dXSARGS not invoked */
PL_markstack_ptr = temp;
XSRETURN_EMPTY; /* return empty stack */
}
/* must have used dXSARGS; list context implied */
return; /* assume stack size is correct */
void
set_Ratings (movieId, custId, rating)
int movieId
int custId
int rating
PREINIT:
I32* temp;
PPCODE:
temp = PL_markstack_ptr++;
set_Ratings(movieId, custId, rating);
if (PL_markstack_ptr != temp) {
/* truly void, because dXSARGS not invoked */
PL_markstack_ptr = temp;
XSRETURN_EMPTY; /* return empty stack */
}
/* must have used dXSARGS; list context implied */
return; /* assume stack size is correct */
void
Engine (x_num_customers, x_num_ratings, x_num_movies)
int x_num_customers
int x_num_ratings
int x_num_movies
PREINIT:
I32* temp;
PPCODE:
temp = PL_markstack_ptr++;
Engine(x_num_customers, x_num_ratings, x_num_movies);
if (PL_markstack_ptr != temp) {
/* truly void, because dXSARGS not invoked */
PL_markstack_ptr = temp;
XSRETURN_EMPTY; /* return empty stack */
}
/* must have used dXSARGS; list context implied */
return; /* assume stack size is correct */
void
DestroyEngine ()
PREINIT:
I32* temp;
PPCODE:
temp = PL_markstack_ptr++;
DestroyEngine();
if (PL_markstack_ptr != temp) {
/* truly void, because dXSARGS not invoked */
PL_markstack_ptr = temp;
XSRETURN_EMPTY; /* return empty stack */
}
/* must have used dXSARGS; list context implied */
return; /* assume stack size is correct */
void
CalcMetrics ()
PREINIT:
I32* temp;
PPCODE:
temp = PL_markstack_ptr++;
CalcMetrics();
if (PL_markstack_ptr != temp) {
/* truly void, because dXSARGS not invoked */
PL_markstack_ptr = temp;
XSRETURN_EMPTY; /* return empty stack */
}
/* must have used dXSARGS; list context implied */
return; /* assume stack size is correct */
void
CalcFeatures ()
PREINIT:
I32* temp;
PPCODE:
temp = PL_markstack_ptr++;
CalcFeatures();
if (PL_markstack_ptr != temp) {
/* truly void, because dXSARGS not invoked */
PL_markstack_ptr = temp;
XSRETURN_EMPTY; /* return empty stack */
}
/* must have used dXSARGS; list context implied */
return; /* assume stack size is correct */
double
PredictRating2 (movieId, custId, feature, cache, bTrailing)
short movieId
int custId
int feature
float cache
bool bTrailing
double
PredictRating (movieId, custId)
short movieId
int custId