movielens

package
v0.2.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 30, 2022 License: AGPL-3.0 Imports: 19 Imported by: 0

README

MovieLens Example

Original Data: MovieLens 100k

SQLite DB file: movielens.db.zip

To run the tests, you need to download the SQLite DB file and put it in the current directory.

# download and unzip the SQLite DB file
wget https://github.com/auxten/edgeRec/files/9895974/movielens.db.zip && unzip movielens.db.zip

SQL that splits the ratings into a training set and a test set by user (80% of users for training, 20% for testing):


-- import data from csv, do it with any tool

select count(distinct userId) from ratings; -- 610 users

create table user as select distinct userId, 0 as is_train  from ratings;

-- choose a random 80% of users (488 of 610) as training users
update user
set is_train = 1
where userId in
      (SELECT userId
       FROM (select distinct userId from ratings)
       ORDER BY RANDOM()
       LIMIT 488);

select count(*) from user where is_train != 1;

-- split train and test set of movielens ratings
create table ratings_train as
select r.userId, movieId, rating, timestamp
from ratings r
         left join user u on r.userId = u.userId
where is_train = 1;
create table ratings_test as
select r.userId, movieId, rating, timestamp
from ratings r
         left join user u on r.userId = u.userId
where is_train = 0;

select count(*) from ratings_train;
select count(*) from ratings_test;
select count(*) from ratings;

select count(distinct movieId) from movies

The DIN way to split dataset

Another way to split the MovieLens-20m dataset, by userId, is described in the Deep Interest Network (DIN) paper.

MovieLens 20m

Related SQL:

create table movies
(
    movieId INTEGER,
    title   TEXT,
    genres  TEXT
);

create table ratings
(
    userId INTEGER,
    movieId INTEGER,
    rating FLOAT,
    timestamp INTEGER
);

create table tags
(
    userId    INTEGER,
    movieId   INTEGER,
    tag       TEXT,
    timestamp INTEGER
);

-- import data from csv, do it with any tool

select count(distinct userId) from ratings; -- 138,493 users

create table user as select distinct userId, 0 as is_train  from ratings;

-- choose 100000 random users as training users
update user
set is_train = 1
where userId in
      (SELECT userId
       FROM (select distinct userId from ratings)
       ORDER BY RANDOM()
       LIMIT 100000);

select count(*) from user where is_train != 1; -- 38,493 test users

-- split train and test set of movielens-20m ratings
create table ratings_train as
select r.userId, movieId, rating, timestamp
from ratings r
         left join user u on r.userId = u.userId
where is_train = 1;
create table ratings_test as
select r.userId, movieId, rating, timestamp
from ratings r
         left join user u on r.userId = u.userId
where is_train = 0;

select count(*) from ratings_train; --14,393,526
select count(*) from ratings_test;  --5,606,737
select count(*) from ratings;       --20,000,263

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func BinarizeLabel

func BinarizeLabel(rating float64) float64

func PreFillUbCache added in v0.2.0

func PreFillUbCache(ubc *ubcache.UserBehaviorCache, table string) (err error)

PreFillUbCache prefills ubcache with data from the db table `ub_test` or `ub_train`. The ub_train table is generated by SQL like:

	```sql
	create table ratings_train_desc as
		select r.userId, movieId, rating, timestamp
			from ratings_train r order by r.userId, timestamp desc;

 create table ub_train as
		select userId, group_concat(movieId) movieIds ,group_concat(timestamp) timestamps
			from ratings_train_desc group by userId order by timestamp;
	```

A sample row from ub_train looks like:

31699, "246,247,252,260,265", "825638410,825638407,825638403,825638401,825638400"

Types

type MovielensRec added in v0.2.0

type MovielensRec struct {
	DataPath  string
	SampleCnt int
	// contains filtered or unexported fields
}

func (*MovielensRec) GetDashboardOverview added in v0.2.0

func (recSys *MovielensRec) GetDashboardOverview(ctx context.Context) (res rcmd.DashboardOverviewResult, err error)

func (*MovielensRec) GetItemFeature added in v0.2.0

func (recSys *MovielensRec) GetItemFeature(ctx context.Context, itemId int) (tensor rcmd.Tensor, err error)

func (*MovielensRec) GetItemsFeatureOverview added in v0.2.0

func (recSys *MovielensRec) GetItemsFeatureOverview(ctx context.Context, offset, size int, _ map[string][]string) (res rcmd.ItemOverviewResult, err error)

func (*MovielensRec) GetUserBehavior added in v0.2.0

func (recSys *MovielensRec) GetUserBehavior(ctx context.Context, userId int,
	maxLen int64, maxPk int64, maxTs int64) (itemSeq []int, err error)

func (*MovielensRec) GetUserFeature added in v0.2.0

func (recSys *MovielensRec) GetUserFeature(ctx context.Context, userId int) (tensor rcmd.Tensor, err error)

func (*MovielensRec) GetUsersFeatureOverview added in v0.2.0

func (recSys *MovielensRec) GetUsersFeatureOverview(ctx context.Context, offset, size int, _ map[string][]string) (res rcmd.UserItemOverviewResult, err error)

func (*MovielensRec) ItemSeqGenerator added in v0.2.0

func (recSys *MovielensRec) ItemSeqGenerator(ctx context.Context) (ret <-chan string, err error)

func (*MovielensRec) PreRank added in v0.2.0

func (recSys *MovielensRec) PreRank(ctx context.Context) (err error)

PreRank is called before ranking; it can be used to prefill the ub cache.

func (*MovielensRec) PreTrain added in v0.2.0

func (recSys *MovielensRec) PreTrain(ctx context.Context) (err error)

func (*MovielensRec) SampleGenerator added in v0.2.0

func (recSys *MovielensRec) SampleGenerator(_ context.Context) (ret <-chan rcmd.Sample, err error)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL