The document discusses using n-grams for content-based file classification and information retrieval. It proposes representing documents as n-gram profiles to be used with machine learning algorithms like k-nearest neighbors (kNN) and support vector machines (SVM). SQL procedures are provided to generate n-gram data in both horizontal and vertical database formats for efficient storage and querying. Preliminary results show n-grams, including multigrams, can improve classification performance over single n-grams.
1. Lazy Multigram Learning Environment for ACRS SISY 2008 Zoran Popović shoom013[at] gmail.com Institute for Multidisciplinary Research Belgrade University
2.
3.
4.
5.
6.
7.
8.
9. ngram.jar – generating N-grams. Usage: java ngram.generator.Arff inDir outfile.arff [options]. Options: -l <Lmin> = lower rank bound (default=1); -m <Lmax> = upper rank bound (default=10); -i <invf> = inverse frequency threshold (default=0.34); -N <N> = N-gram order (default=3); -D <depth> = maximum number of N-grams (default=4294967295); -w <url> = use database with JDBC URL to write data; -r <url> = use database with JDBC URL to read ARFF; -u = do not use normalized vectors for output. EXAMPLE: . Arff.sh . .ut.arff -l 1 -m 500 -N 4 -i 0.5 -D 1048576 (subfolders are used as category names)
16. Weka – JDBC: the horizontal form of data is needed, i.e. all attributes in each row. ARFF supports data given as sparse vectors (zero values are omitted – this also speeds up SVM).
19. SQL and vertical N-gram storage PROFILES: NSHARED NGRAMS: TSHARED
20.
21. SQL and data transformation Data2.sql – cursor function for a query returning horizontal form of data:
-- fewcols: builds the "horizontal" (one column per n-gram) row for a single
-- bag and returns it as a ref cursor over a dynamically built
-- "select ... from dual" query.
--
-- Parameters:
--   p_lmin, p_lmax  bounds applied to rank0 (the per-N rank computed in the
--                   inline view T); p_lmin is also the first column index
--                   that gets zero-filled, p_lmax the last.
--   p_invf          upper bound on (rows in tshared with the same ngram and N)
--                   divided by (number of distinct bag_id values in tshared).
--   p_bagid         bag whose row is produced.
--   p_norm          when true, every count is divided by the value returned
--                   by cursor CS; when false the divisor stays 1.
--
-- Returns: an open sys_refcursor with the columns
--   bag_id, A<k> ... (one per n-gram column), category.
--
-- Raises: NO_DATA_FOUND / TOO_MANY_ROWS from the category lookup when the
--   bag has no row in tshared or has more than one distinct category.
--
-- Limits: the generated select list is held in a varchar2(32000); a very
--   wide rank window can overflow it (VALUE_ERROR).
create or replace function fewcols(
    p_lmin  number,
    p_lmax  number,
    p_invf  number,
    p_bagid number,
    p_norm  boolean default true
) return sys_refcursor
is
    str  varchar2(32000);        -- select list under construction
    cat  varchar2(64) := null;   -- category of the bag
    opt  sys_refcursor;
    j    number;                 -- next column index that still has to be emitted
    norm number := 1;            -- divisor applied to every count

    -- N-grams of one bag that pass the rank window and the inverse-frequency
    -- threshold, in ascending column order.
    cursor C(lmin number, lmax number, invf number, bagid number) is
        select T.rank - 1 rank,
               TS.count count,
               TS.bag_id,
               TS.category,
               TS.N
          from tshared TS
         inner join (select rownum - (select count(bag_id)
                                        from tshared ts0
                                       where ts0.N < T0.N) rank0,
                            rownum rank,
                            N,
                            ngram
                       from (select N, ngram, count
                               from nshared
                              order by N asc, count desc) T0) T
            on TS.ngram = T.ngram
           and TS.N = T.N
         where T.rank0 between lmin and lmax
           and (select count(bag_id)
                  from tshared TS2
                 where TS2.ngram = TS.ngram
                   and TS2.N = TS.N)
               / (select count(distinct bag_id) from tshared) <= invf
           and TS.bag_id = bagid
         order by TS.bag_id asc, TS.N asc, T.rank asc;

    -- Normalisation divisor for the same row set as cursor C.
    -- NOTE(review): this is the Euclidean norm of the (rank - 1) values, while
    -- the values being divided are the counts; sqrt(sum(TS.count * TS.count))
    -- may have been intended - confirm before changing.
    cursor CS(lmin number, lmax number, invf number, bagid number) is
        select sqrt(sum((T.rank - 1) * (T.rank - 1))) norm
          from tshared TS
         inner join (select rownum - (select count(bag_id)
                                        from tshared ts0
                                       where ts0.N < T0.N) rank0,
                            rownum rank,
                            N,
                            ngram
                       from (select N, ngram, count
                               from nshared
                              order by N asc, count desc) T0) T
            on TS.ngram = T.ngram
           and TS.N = T.N
         where T.rank0 between lmin and lmax
           and (select count(bag_id)
                  from tshared TS2
                 where TS2.ngram = TS.ngram
                   and TS2.N = TS.N)
               / (select count(distinct bag_id) from tshared) <= invf
           and TS.bag_id = bagid;
begin
    str := to_char(p_bagid) || ' bag_id';
    j := p_lmin;

    if p_norm then
        open CS(p_lmin, p_lmax, p_invf, p_bagid);
        fetch CS into norm;
        close CS;
        -- The aggregate yields NULL when no row qualifies and 0 when every
        -- qualifying value is 0; neither is usable as a divisor.
        if norm is null or norm = 0 then
            norm := 1;
        end if;
    end if;

    for i in C(p_lmin, p_lmax, p_invf, p_bagid) loop
        if cat is null then
            cat := i.category;
        end if;

        -- Zero-fill the columns skipped since the previous n-gram.
        if j < i.rank then
            for n in j .. i.rank - 1 loop
                str := str || ', 0 A' || to_char(n);
            end loop;
        end if;

        str := str || ', ' || to_char(i.count / norm, '9999.99999999')
                   || ' A' || to_char(i.rank);
        j := i.rank + 1;
    end loop;

    -- No n-gram qualified, or trailing columns are missing: look the category
    -- up directly and zero-fill up to p_lmax.
    if cat is null or j <= p_lmax then
        select distinct category into cat from tshared where bag_id = p_bagid;
        for n in j .. p_lmax loop
            str := str || ', 0 A' || to_char(n);
        end loop;
    end if;

    -- The category becomes a string literal in dynamic SQL, so embedded single
    -- quotes must be doubled; otherwise the statement is malformed (or injectable).
    str := str || ', ''' || replace(cat, '''', '''''') || ''' category';

    open opt for 'select ' || str || ' from dual';
    return opt;
end;
/
22. SQL and data transformation Data3.sql – procedure generating table DATA3 in horizontal form:
-- data: (re)creates table DATA3 with one row per bag in "horizontal" form:
-- bag_id, one A<k> column per n-gram column, and category.
--
-- Parameters:
--   p_lmin, p_lmax  bounds applied to rank0 (the per-N rank computed in the
--                   inline view T); p_lmin is also the first column index
--                   that gets zero-filled, p_lmax the last.
--   p_invf          upper bound on (rows in tshared with the same ngram and N)
--                   divided by (number of distinct bag_id values in tshared).
--   p_norm          when true, every count is divided by the value returned
--                   by cursor CS for that bag; when false the divisor is 1.
--
-- Side effects: drops DATA3 if it exists, creates it from the first bag's row
--   (CREATE TABLE ... AS SELECT), inserts one row per remaining bag and
--   commits. DDL commits implicitly, so the caller's open transaction is
--   committed as well. If tshared contains no bags, DATA3 is left untouched.
--
-- Raises: TOO_MANY_ROWS from the category lookup when a bag has more than one
--   distinct category; any error from the drop other than "table does not
--   exist" (ORA-00942).
create or replace procedure data(
    p_lmin number,
    p_lmax number,
    p_invf number,
    p_norm boolean default true
)
is
    str   varchar2(32000);        -- select list of the row under construction
    cat   varchar2(64);           -- category of the current bag
    first boolean := true;        -- true until DATA3 has been created
    j     number;                 -- next column index that still has to be emitted
    norm  number;                 -- divisor applied to every count of the current bag

    -- N-grams of one bag that pass the rank window and the inverse-frequency
    -- threshold, in ascending column order.
    cursor C(lmin number, lmax number, invf number, bagid number) is
        select T.rank - 1 rank,
               TS.count count,
               TS.bag_id,
               TS.category,
               TS.N
          from tshared TS
         inner join (select rownum - (select count(bag_id)
                                        from tshared ts0
                                       where ts0.N < T0.N) rank0,
                            rownum rank,
                            N,
                            ngram
                       from (select N, ngram, count
                               from nshared
                              order by N asc, count desc) T0) T
            on TS.ngram = T.ngram
           and TS.N = T.N
         where T.rank0 between lmin and lmax
           and (select count(bag_id)
                  from tshared TS2
                 where TS2.ngram = TS.ngram
                   and TS2.N = TS.N)
               / (select count(distinct bag_id) from tshared) <= invf
           and TS.bag_id = bagid
         order by TS.bag_id asc, TS.N asc, T.rank asc;

    -- Normalisation divisor for the same row set as cursor C.
    -- NOTE(review): this is the Euclidean norm of the (rank - 1) values, while
    -- the values being divided are the counts; sqrt(sum(TS.count * TS.count))
    -- may have been intended - confirm before changing.
    cursor CS(lmin number, lmax number, invf number, bagid number) is
        select sqrt(sum((T.rank - 1) * (T.rank - 1))) norm
          from tshared TS
         inner join (select rownum - (select count(bag_id)
                                        from tshared ts0
                                       where ts0.N < T0.N) rank0,
                            rownum rank,
                            N,
                            ngram
                       from (select N, ngram, count
                               from nshared
                              order by N asc, count desc) T0) T
            on TS.ngram = T.ngram
           and TS.N = T.N
         where T.rank0 between lmin and lmax
           and (select count(bag_id)
                  from tshared TS2
                 where TS2.ngram = TS.ngram
                   and TS2.N = TS.N)
               / (select count(distinct bag_id) from tshared) <= invf
           and TS.bag_id = bagid;
begin
    -- A cursor FOR loop closes its cursor on every exit path.
    for b in (select distinct bag_id from tshared) loop
        -- Per-bag state must start clean; otherwise a bag whose columns are
        -- all populated would inherit the previous bag's category.
        cat  := null;
        norm := 1;
        str  := to_char(b.bag_id) || ' bag_id';
        j    := p_lmin;

        if p_norm then
            open CS(p_lmin, p_lmax, p_invf, b.bag_id);
            fetch CS into norm;
            close CS;
            -- The aggregate yields NULL when no row qualifies and 0 when every
            -- qualifying value is 0; neither is usable as a divisor.
            if norm is null or norm = 0 then
                norm := 1;
            end if;
        end if;

        for i in C(p_lmin, p_lmax, p_invf, b.bag_id) loop
            if cat is null then
                cat := i.category;
            end if;

            -- Zero-fill the columns skipped since the previous n-gram.
            if j < i.rank then
                for n in j .. i.rank - 1 loop
                    str := str || ', 0 A' || to_char(n);
                end loop;
            end if;

            str := str || ', ' || to_char(i.count / norm, '9999.99999999')
                       || ' A' || to_char(i.rank);
            j := i.rank + 1;
        end loop;

        -- No n-gram qualified, or trailing columns are missing: look the
        -- category up directly and zero-fill up to p_lmax.
        if cat is null or j <= p_lmax then
            select distinct category into cat from tshared where bag_id = b.bag_id;
            for n in j .. p_lmax loop
                str := str || ', 0 A' || to_char(n);
            end loop;
        end if;

        -- Single quotes are doubled because the category becomes a string
        -- literal in dynamic SQL. The cast matters for the CREATE TABLE AS
        -- SELECT below: a bare literal would create a CHAR(n) column sized to
        -- the first category, rejecting longer categories and blank-padding
        -- shorter ones.
        str := str || ', cast(''' || replace(cat, '''', '''''')
                   || ''' as varchar2(64)) category';

        if first then
            first := false;
            begin
                execute immediate 'drop table data3';
            exception
                when others then
                    -- Only "table or view does not exist" is expected here.
                    if sqlcode != -942 then
                        raise;
                    end if;
            end;
            execute immediate 'create table data3 as select ' || str || ' from dual';
        else
            execute immediate 'insert into data3 select ' || str || ' from dual';
        end if;
    end loop;

    commit;
end;
/