📄 data preprocessing for tc.sql
字号:
--1---------import from text to table reuter21587
--2---------step1:build dictionary using term extraction of SSIS(sql server integrated service)
select * from dic_test
--3--------- create dataset from reuter21587, add doc_id
select identity(int,1,1) as doc_id, topic, [text] into Dataset from reuters21578
select * from dataset
--4--------- using SSIS build Term Vector by using table dic(after stemming) or table dic_test(before stemming)
------------ destination table is vector(926,388) and vector_after_stemed_dic(694054)
select count(*) from vector_from_stemed_dic
--5---------stem dic
select count(*) from dataset
where [text] like '%accountant%'
select * from dic_test
where term like '%accountant%'
----------- stemer normalize dictionary into table dic
drop table #dic
drop table dic
select distinct dbo.fnporteralgorithm(term) as term into #dic
from dic_test
--where score>5 ---can be different
order by term
select identity(int,1,1) as id, term into dic from #dic
order by id
--6-------stem vector (with index on term and id, 7'19'' too much!)without index(8'32'') build a new table vector_stem
select doc_id, dbo.fnporteralgorithm(term) as term, frequency as tf, topic as class
into vector_stem1 from vector
order by doc_id
--7-----build term vecter space from vector (merge the same term and sum the tf after stemming)
----remember change topic(ntext) to nvchar(max)
--drop table vector1
select * from vector_stem1
where doc_id=1 and term like '%doubt%'
order by term
select * from vector1
where doc_id=1 and term like '%doubt%'
order by term
select doc_id, class as class, term, sum(tf) as tf
into vector1 from vector_stem1
group by doc_id, class, term
order by doc_id, class, term
---check the result(records of vector_stem have been reduced from 926,338 to 874,151)
select count(*) from vector1
select count(*) from vector_stem1
------added term word_id from table dic1 to term vector space.
--drop table vector2
select t.doc_id, t.class, d.id as wordid, t.term, t.tf into vector2
from vector1 t, dic d
where t.term=d.term
order by t.doc_id
select count(*) from vector2
where doc_id=4 --and term like '%conver%'
order by term
------check the wrong stemer result of vector_stem, may caused by double stem
select * from vector2--_stem1
where doc_id=4 and term like '%conver%'
order by term
select * from vector_stem
where doc_id=4 and term like '%conver%'
order by term
----calculate document frequency......
drop table dic_freq
select wordid, term, count(tf)as doc_frq into dic_freq
from vector2
group by wordid, term
order by wordid, term
select * from dic where term not in(
select term from dic_freq)
----add doc_frq into term_vector_space table vecter3
select t.doc_id, t.class, t.wordid, t.term, t.tf, d.doc_frq into Vector3
from vector2 t, dic_freq d
where t.wordid=d.wordid
-------20 words do not appear in the vecotr3 and being deleted from dic
select * from vector3
where term in(
select term from dic where term not in (select term from dic_freq))
---calculate tf_idf......
select *, (1+log10(tf))*log10(21587/doc_frq) as tf_idf into vector4 from vector3
select * from vector4
order by tf_idf desc
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -