NAME
QDBM_File - Tied access to Quick Database Manager
SYNOPSIS
use QDBM_File;
# hash db
[$db =] tie %hash, "QDBM_File", $filename, [$flags, $mode, $buckets];
[$db =] tie %hash, "QDBM_File::Multiple",
$filename, [$flags, $mode, $buckets, $dirs];
$hash{"abc"} = "1234";
$value = $hash{"abc"};
untie %hash;
$db = QDBM_File->new($filename, [$flags, $mode, $buckets]);
$db = QDBM_File::Multiple->new(
$filename, [$flags, $mode, $buckets, $dirs]
);
$db->STORE("abc", "1234");
$value = $db->FETCH("abc");
undef $db;
# b+ tree
# $compare_sub example: sub { $_[0] cmp $_[1] }
[$db =] tie %hash, "QDBM_File::BTree",
$filename, [$flags, $mode, $compare_sub];
[$db =] tie %hash, "QDBM_File::BTree::Multiple",
$filename, [$flags, $mode, $compare_sub];
$db = QDBM_File::BTree->new($filename, [$flags, $mode, $compare_sub]);
$db = QDBM_File::BTree::Multiple->new(
$filename, [$flags, $mode, $compare_sub]
);
# inverted index
$db = QDBM_File::InvertedIndex->new($filename, [$flags]);
# hash db, btree, inverted index common api
$num = $db->get_size();
$name = $db->get_name();
$num = $db->get_mtime();
$bool = $db->sync();
$bool = $db->optimize([$buckets]);
$bool = $db->init_iterator();
$bool = $db->is_writable();
$bool = $db->is_fatal_error();
$msg = $class->get_error();
# hash db, btree common api
$bool = $db->STORE($key, $value);
$bool = $db->store_keep($key, $value);
$bool = $db->store_cat($key, $value);
$num = $db->get_record_size($key);
$num = $db->count_records();
$bool = $class->repair($filename);
$bool = $db->export_db($filename);
$bool = $db->import_db($filename);
# hash db only
$value = $db->FETCH($key, [$start, $offset]);
$bool = $db->set_align($align);
$bool = $db->set_fbp_size($size);
$num = $db->count_buckets();
$num = $db->count_used_buckets();
# Large Object: QDBM_File::Multiple only
$bool = $db->store_lob($key, $value);
$bool = $db->store_keep_lob($key, $value);
$bool = $db->store_cat_lob($key, $value);
$value = $db->fetch_lob($key);
$bool = $db->delete_lob($key);
$bool = $db->exists_lob($key);
$num = $db->count_lob_records();
# btree only
$bool = $db->store_dup($key, $value);
$bool = $db->store_dupr($key, $value);
$bool = $db->store_list($key, @values);
@values = $db->fetch_list($key);
$bool = $db->delete_list($key);
$num = $db->count_match_records($key);
$num = $db->count_leafs();
$num = $db->count_non_leafs();
$bool = $db->move_first();
$bool = $db->move_last();
$bool = $db->move_next();
$bool = $db->move_prev();
$bool = $db->move_forward($key);
$bool = $db->move_backword($key);
$key = $db->get_current_key();
$value = $db->get_current_value();
$bool = $db->store_current($value);
$bool = $db->store_after($value);
$bool = $db->store_before($value);
$bool = $db->delete_current($value);
$bool = $db->begin_transaction();
$bool = $db->commit();
$bool = $db->rollback();
$db->set_tuning(
$max_leaf_record,
$max_non_leaf_index,
$max_cache_leaf,
$max_cache_non_leaf
);
# DBM_Filter
$old_filter = $db->filter_store_key ( sub { ... } );
$old_filter = $db->filter_store_value( sub { ... } );
$old_filter = $db->filter_fetch_key ( sub { ... } );
$old_filter = $db->filter_fetch_value( sub { ... } );
# inverted index api
$doc = QDBM_File::InvertedIndex->create_document($uri);
$bool = $db->store_document($doc, [$max_words, $is_overwrite]);
$doc = $db->get_document_by_uri($uri);
$doc = $db->get_document_by_id($id);
$id = $db->get_document_id($uri);
$bool = $db->delete_document_by_uri($uri);
$bool = $db->delete_document_by_id($id);
$bool = $db->exists_document_by_uri($uri);
$bool = $db->exists_document_by_id($id);
$doc = $db->get_next_document();
@id = $db->search_document($word, [$max]);
$num = $db->search_document_count($word);
$bool = QDBM_File::InvertedIndex->merge($filename, @filenames);
%score = $db->get_scores($doc, $max);
QDBM_File::InvertedIndex->set_tuning(
$index_buckets,
$inverted_index_division_num,
$dirty_buffer_buckets,
$dirty_buffer_size
);
$db->set_char_class($space, $delimiter, $glue);
@appearance_words = $db->analyze_text($text);
@appearance_words = QDBM_File::InvertedIndex->analyze_text($text);
$normalized_word = QDBM_File::InvertedIndex->normalize_word($word);
@id = $db->query($query);
# document api
$doc = QDBM_File::InvertedIndex::Document->new($uri);
$doc->set_attribute($name, $value);
$value = $doc->get_attribute($name);
$doc->add_word($normalized_word, $appearance_word);
$uri = $doc->get_uri();
$id = $doc->get_id();
@normalized_words = $doc->get_normalized_words();
@appearance_words = $doc->get_appearance_words();
%score = $doc->get_scores($max, [$db]);
DESCRIPTION
QDBM_File is a module which allows Perl programs to make use of the facilities provided by the qdbm library. If you use this module, you should read the qdbm manual pages.
Quick Database Manager is a high performance dbm library maintained by Mikio Hirabayashi. QDBM_File provides access to its various APIs: Depot, Curia, Villa, Vista and Odeum. Documentation is available at http://qdbm.sourceforge.net/
HASH DATABASE
Hash database is the basic file format of qdbm. Its functionality is equivalent to that of other dbm modules.
Example
This is an example of a hash database.
use Fcntl;
use QDBM_File;
my %hash;
my $filename = "mydata";
tie %hash, "QDBM_File", $filename, O_RDWR|O_CREAT, 0644 or die $!;
$hash{"key"} = "value"; # store value
my $value = $hash{"key"}; # fetch value
untie %hash; # close database
Methods
- TIEHASH
-
tie %hash, "QDBM_File", $filename, $flags, $mode, $buckets;
Tie interface is similar to other dbm modules. Optional $flags is opening flags importable from Fcntl, and $mode is file permission. O_CREAT|O_RDWR and 0644 are used if omitted. $buckets specifies the number of elements of the bucket array. If omitted, -1 is used (qdbm default).
tie %hash, "QDBM_File", "mydata", O_CREAT|O_RDWR, 0644, -1;
tie %hash, "QDBM_File", "mydata"; # equivalent
- QDBM_File->new($filename, [$flags, $mode, $buckets])
-
OOP constructor of QDBM_File. Arguments are equivalent to tie interface.
$db = QDBM_File->new("mydata", O_CREAT|O_RDWR, 0644, -1);
- $db->STORE($key, $value)
-
Store value to the database. Existing value is overwritten.
tie %hash, "QDBM_File", "mydata";
$hash{"abc"} = "1234"; # tied interface
$db->STORE("abc", "1234"); # OOP interface
- $db->store_keep($key, $value)
-
Similar to STORE(), but the existing value is kept.
$db->store_keep("abc", "1234");
$db->store_keep("abc", "5678"); # value is still 1234
- $db->store_cat($key, $value)
-
Similar to STORE(), but the new value is concatenated to the existing value.
$db->store_cat("abc", "1234");
$db->store_cat("abc", "5678"); # value is 12345678
- $db->FETCH($key, [$start, $offset])
-
Fetch value from the database. It has optional arguments $start and $offset. $start specifies the start position to be read, and $offset specifies the max size to be read.
$hash{"abc"} = "defg";
$value = $hash{"abc"};
$value = $db->FETCH("abc");
$value = $db->FETCH("abc", 1, 2); # "ef"
- $db->get_size()
-
Get file size of the database.
- $db->get_name()
-
Get name of the database.
- $db->get_mtime()
-
Get modified time of the database.
- $db->get_record_size($key)
-
Get size of the value.
- $db->count_records()
-
Get number of records of the database.
- $db->sync()
-
Write buffers immediately.
- $db->is_writable()
-
Return true if database is writable.
- QDBM_File->get_error()
-
Get last error message.
- $db->is_fatal_error()
-
Return true if database has a fatal error.
- $db->init_iterator()
-
Initialize iterator for keys(), values(), each().
- $db->optimize([$buckets])
-
Optimize the database file. $buckets is the number of elements of the bucket array. Default is -1 (qdbm default).
- QDBM_File->repair($filename)
-
Repair broken database file.
- $db->export_db($filename)
-
Export database as endian independent file.
- $db->import_db($filename)
-
Import file exported by export_db().
- $db->set_align($align)
-
Set size of database alignment.
- $db->set_fbp_size($size)
-
Set size of free block pool. Default is 16.
- $db->count_buckets()
-
Get number of elements of the bucket array.
- $db->count_used_buckets()
-
Get number of elements of the used bucket array.
MULTIPLE DIRECTORY DATABASE
QDBM_File::Multiple is an extended hash database. Database files are stored in multiple directories. The API is the same as QDBM_File.
QDBM_File::Multiple also provides a large object management API. Large object records are stored in individual files.
Methods
- TIEHASH
-
tie %hash, "QDBM_File::Multiple", $filename, $flags, $mode, $buckets, $dirs;
QDBM_File::Multiple has optional argument $dirs, which specifies the division number of the directory. Default is -1 (qdbm default).
- $db->store_lob($key, $value)
-
Store value to the database. Record is stored in individual files.
- $db->store_keep_lob($key, $value)
-
Similar to store_lob(), but the existing value is kept.
- $db->store_cat_lob($key, $value)
-
Similar to store_lob(), but the new value is concatenated to the existing value.
- $db->fetch_lob($key)
-
Fetch the large object from the database.
- $db->delete_lob($key)
-
Delete the large object record.
- $db->exists_lob($key)
-
Return true if the large object record exists.
- $db->count_lob_records()
-
Number of large object records of the database.
BTREE DATABASE
QDBM_File::BTree allows data to be stored in sorted order. It is possible to compare keys with a user-defined subroutine.
Example
This is an example of a b+ tree database.
use Fcntl;
use QDBM_File;
my $filename = "mydata";
my $compare = sub { $_[0] cmp $_[1] };
my %hash;
my $db = tie %hash, "QDBM_File::BTree",
$filename, O_RDWR|O_CREAT, 0640, $compare or die $!;
$hash{"def"} = "DEF";
$hash{"abc"} = "ABC";
$hash{"ghi"} = "GHI";
print join " ", keys %hash; # abc def ghi
Methods
- TIEHASH
-
tie %hash, "QDBM_File::BTree", $filename, $flags, $mode, $compare_sub;
QDBM_File::BTree has optional argument $compare_sub, used for key comparison; it must return -1, 0 or 1. By default, lexical order is used.
sub { $_[0] cmp $_[1] } # lexical order
sub { $_[0] <=> $_[1] } # numerical order
- $db->store_list($key, @values)
-
Store values as list.
- $db->store_dup($key, $value)
-
Similar to STORE(), but duplication of keys is allowed and the specified value is added as the last one.
- $db->store_dupr($key, $value)
-
Similar to STORE(), but duplication of keys is allowed and the specified value is added as the first one.
- $db->fetch_list($key)
-
Fetch values as list.
- $db->delete_list($key)
-
Delete all records corresponding to a key.
- $db->count_match_records($key)
-
Get the number of records corresponding to a key.
- $db->count_leafs()
-
Get number of the leaf nodes of b+ tree.
- $db->count_non_leafs()
-
Get number of the non-leaf nodes of b+ tree.
- $db->move_first()
-
Move the cursor to the first record.
- $db->move_last()
-
Move the cursor to the last record.
- $db->move_next()
-
Move the cursor to the next record.
- $db->move_prev()
-
Move the cursor to the previous record.
- $db->move_forward($key)
-
Set the cursor to the first record with the specified key. If no exactly matching record exists, the cursor is set to the next substitute record.
- $db->move_backword($key)
-
Set the cursor to the last record with the specified key. If no exactly matching record exists, the cursor is set to the previous substitute record.
- $db->get_current_key()
-
Get key of the record where the cursor is.
- $db->get_current_value()
-
Get value of the record where the cursor is.
- $db->store_current($value)
-
Overwrite the current record.
- $db->store_after($value)
-
Insert record after the current record.
- $db->store_before($value)
-
Insert record before the current record.
- $db->delete_current($value)
-
Delete the record where the cursor is.
- $db->begin_transaction()
-
Order to begin the transaction.
- $db->commit()
-
Order to commit the transaction.
- $db->rollback()
-
Order to abort the transaction.
- $db->set_tuning
-
$db->set_tuning( $max_leaf_record, $max_non_leaf_index, $max_cache_leaf, $max_cache_non_leaf );
Set the tuning parameters for performance.
$max_leaf_record specifies the max number of records in a leaf node of b+ tree. $max_non_leaf_index specifies the max number of indexes in a non-leaf node of b+ tree. $max_cache_leaf specifies the max number of caching leaf nodes. $max_cache_non_leaf specifies the max number of caching non-leaf nodes. The default setting is equivalent to (49, 192, 1024, 512). Because tuning parameters are not saved in a database, you should specify them every time you open a database.
MULTIPLE DIRECTORY BTREE DATABASE
QDBM_File::BTree::Multiple is multiple directory version of QDBM_File::BTree. API is the same as QDBM_File::BTree.
INVERTED INDEX
QDBM_File::InvertedIndex provides inverted index API. Inverted index is a data structure to retrieve a list of some documents that include one of words which were extracted from a population of documents. See http://qdbm.sourceforge.net/spex.html#odeumapi for more details.
Example
This is an example of QDBM_File::InvertedIndex.
use Fcntl;
use QDBM_File;
my $filename = "mydata";
my $db = QDBM_File::InvertedIndex->new($filename, O_RDWR|O_CREAT) or die $!;
my $uri = "http://www.perl.com/";
my $doc = QDBM_File::InvertedIndex->create_document($uri);
my @words = QDBM_File::InvertedIndex->analyze_text(
"There is more than one way to do it."
);
for my $word (@words) {
my $normal = QDBM_File::InvertedIndex->normalize_word($word);
$doc->add_word($normal, $word);
}
$db->store_document($doc);
my @id = $db->search_document("way");
my $doc2 = $db->get_document_by_id($id[0]);
print $doc2->get_uri(); # http://www.perl.com/
Methods
- QDBM_File::InvertedIndex->new($filename, [$flags])
-
Constructor of QDBM_File::InvertedIndex.
- QDBM_File::InvertedIndex->create_document($uri)
-
Create QDBM_File::InvertedIndex::Document object.
$uri specifies the URI of a document. The id number of a new document is not defined. It is defined when the document is stored in a database.
- $db->store_document($doc, [$max_words, $is_overwrite])
-
Store document to the database.
$max_words specifies the max number of words to be stored in the document database. Default is -1 (unlimited). $is_overwrite specifies whether the data of the duplicated document is overwritten or not.
- $db->get_document_by_uri($uri)
-
Retrieve a document by a uri.
- $db->get_document_by_id($id)
-
Retrieve a document by an id.
- $db->get_document_id($uri)
-
Get id of uri.
- $db->delete_document_by_uri($uri)
-
Delete a document by a uri.
- $db->delete_document_by_id($id)
-
Delete a document by an id.
- $db->exists_document_by_uri($uri)
-
Check whether the document by a uri exists.
- $db->exists_document_by_id($id)
-
Check whether the document by an id exists.
- $db->get_next_document()
-
Get the next document.
- $db->search_document($word, [$max])
-
Search the inverted index for documents including a particular word. Return values are array of id.
- $db->search_document_count($word)
-
Get number of documents including a word.
- QDBM_File::InvertedIndex->merge($filename, @filenames)
-
Merge multiple database directories.
- $db->get_scores($doc, $max)
-
Get keywords of document in normalized form and their scores.
$max specifies the max number of keywords to get.
- QDBM_File::InvertedIndex->set_tuning
-
QDBM_File::InvertedIndex->set_tuning( $index_buckets, $inverted_index_division_num, $dirty_buffer_buckets, $dirty_buffer_size );
Set the global tuning parameters. $index_buckets specifies the number of buckets for inverted indexes. $inverted_index_division_num specifies the division number of inverted index. $dirty_buffer_buckets specifies the number of buckets for dirty buffers. $dirty_buffer_size specifies the maximum number of bytes of memory to use for dirty buffers. The default setting is equivalent to (32749, 7, 262139, 8388608). This method should be called before opening a database.
- $db->set_char_class($space, $delimiter, $glue)
-
Set the classes of characters used by analyze_text(). $space specifies a string containing space characters. $delimiter specifies a string containing delimiter characters. $glue specifies a string containing glue characters.
- $db->analyze_text($text)
-
Break a text into words and return their appearance forms and normalized forms as lists.
- QDBM_File::InvertedIndex->analyze_text($text)
-
Break a text into words in appearance form. Words are separated with space characters and such delimiters as period, comma and so on.
- QDBM_File::InvertedIndex->normalize_word($word)
-
Get normalized form of a word.
- $db->query($query)
-
Query a database using a small boolean query language. Return values are list of id.
@doc_id = $db->query("There | more"); # "There" || "more"
@doc_id = $db->query("There & foo"); # "There" && "foo"
@doc_id = $db->query("There ! foo"); # "There" && !"foo"
@doc_id = $db->query("There & (more | foo)");
Document Methods
- QDBM_File::InvertedIndex::Document->new($uri)
-
Create QDBM_File::InvertedIndex::Document object.
$uri specifies the uri of a document. The id number of a new document is not defined. It is defined when the document is stored in a database.
- $doc->set_attribute($name, $value)
-
Add an attribute to the document.
- $doc->get_attribute($name)
-
Get an attribute of the document.
- $doc->add_word($normalized_word, $appearance_word)
-
Add a word to the document.
$normalized_word specifies the string of the normalized form of a word. Normalized forms are treated as keys of the inverted index. $appearance_word specifies the string of the appearance form of the word.
- $doc->get_uri()
-
Get uri of the document.
- $doc->get_id()
-
Get id of the document.
- $doc->get_normalized_words()
-
Return words of the document in normalized form.
- $doc->get_appearance_words()
-
Get words of the document in appearance form.
- $doc->get_scores($max, [$db])
-
Get keywords of document in normalized form and their scores.
$max specifies the max number of keywords to get. $db specifies the QDBM_File::InvertedIndex object with which the IDF for weighting is calculated.
AUTHOR
Toshiyuki Yamato, <toshiyuki.yamato@gmail.com>
BUGS AND WARNINGS
Currently, umask flags are implicitly ignored and 0644 is always used. This is for compatibility with other dbm modules.
SEE ALSO
COPYRIGHT & LICENSE
Copyright 2007-2008 Toshiyuki Yamato, all rights reserved.
This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.