Skip to content

Commit aed59b1

Browse files
committed
ENH: Adds PostHistory, Comments and Tags to stackoverflow database
1 parent f40d5db commit aed59b1

File tree

5 files changed

+65
-6
lines changed

5 files changed

+65
-6
lines changed

load_into_pg.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
7474
if pre != '':
7575
cur.execute(pre)
7676
conn.commit()
77-
print 'Pre-processing took {} seconds'.format(time.time() - start_time)
77+
print 'Pre-processing took {:.1f} seconds'.format(time.time() - start_time)
7878

7979
# Handle content of the table
8080
start_time = time.time()
@@ -91,7 +91,7 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
9191
' VALUES\n' + valuesStr + ';'
9292
cur.execute(cmd)
9393
conn.commit()
94-
print 'Table processing took {} seconds'.format(time.time() - start_time)
94+
print 'Table processing took {:.1f} seconds'.format(time.time() - start_time)
9595

9696
# Post-processing (creation of indexes)
9797
start_time = time.time()
@@ -119,7 +119,7 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
119119
parser = argparse.ArgumentParser()
120120
parser.add_argument( 'table'
121121
, help = 'The table to work on.'
122-
, choices = ['Users', 'Badges', 'Posts', 'Tags', 'Votes']
122+
, choices = ['Users', 'Badges', 'Posts', 'Tags', 'Votes', 'PostHistory', 'Comments']
123123
)
124124

125125
parser.add_argument( '-d', '--dbname'
@@ -134,12 +134,12 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
134134

135135
parser.add_argument( '-u', '--username'
136136
, help = 'Username for the database.'
137-
, default = None
137+
, default = 'postgres'
138138
)
139139

140140
parser.add_argument( '-p', '--password'
141141
, help = 'Password for the database.'
142-
, default = None
142+
, default = 'fibinse'
143143
)
144144

145145
parser.add_argument( '-P', '--port'
@@ -232,7 +232,25 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
232232
, 'ExcerptPostId'
233233
, 'WikiPostId'
234234
]
235-
235+
elif table == 'PostHistory':
236+
keys = [
237+
'Id',
238+
'PostHistoryTypeId',
239+
'PostId',
240+
'RevisionGUID',
241+
'CreationDate',
242+
'UserId',
243+
'Text'
244+
]
245+
elif table == 'Comments':
246+
keys = [
247+
'Id',
248+
'PostId',
249+
'Score',
250+
'Text',
251+
'CreationDate',
252+
'UserId',
253+
]
236254
choice = raw_input('This will drop the {} table. Are you sure [y/n]?'.format(table))
237255

238256
if len(choice) > 0 and choice[0].lower() == 'y':

sql/Comments_post.sql

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
-- hash index takes too long to create
2+
CREATE INDEX cmnts_score_idx ON Comments USING btree (Score) -- index name matches the indexed column
3+
WITH (FILLFACTOR = 100);
4+
CREATE INDEX cmnts_postid_idx ON Comments USING hash (PostId)
5+
WITH (FILLFACTOR = 100);
6+
-- No RevisionGUID index for Comments: the Comments table has no such column
7+
-- (it only exists on PostHistory), so creating that index here would fail.
8+
CREATE INDEX cmnts_creation_date_idx ON Comments USING btree (CreationDate)
9+
WITH (FILLFACTOR = 100);
10+
CREATE INDEX cmnts_userid_idx ON Comments USING btree (UserId)
11+
WITH (FILLFACTOR = 100);

sql/Comments_pre.sql

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
DROP TABLE IF EXISTS Comments CASCADE; -- drop the table that is re-created below
2+
CREATE TABLE Comments (
3+
Id int PRIMARY KEY ,
4+
PostId int,
5+
Score int,
6+
Post_Text text,
7+
CreationDate timestamp not NULL ,
8+
UserId int
9+
);

sql/PostHistory_post.sql

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
-- hash index takes too long to create
2+
CREATE INDEX ph_post_type_id_idx ON PostHistory USING btree (PostHistoryTypeId)
3+
WITH (FILLFACTOR = 100);
4+
CREATE INDEX ph_postid_idx ON PostHistory USING hash (PostId)
5+
WITH (FILLFACTOR = 100);
6+
CREATE INDEX ph_revguid_idx ON PostHistory USING btree (RevisionGUID)
7+
WITH (FILLFACTOR = 100);
8+
CREATE INDEX ph_creation_date_idx ON PostHistory USING btree (CreationDate)
9+
WITH (FILLFACTOR = 100);
10+
CREATE INDEX ph_userid_idx ON PostHistory USING btree (UserId)
11+
WITH (FILLFACTOR = 100);

sql/PostHistory_pre.sql

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
DROP TABLE IF EXISTS PostHistory CASCADE; -- drop the table that is re-created below
2+
CREATE TABLE PostHistory (
3+
Id int PRIMARY KEY ,
4+
PostHistoryTypeId int,
5+
PostId int,
6+
RevisionGUID text,
7+
CreationDate timestamp not NULL ,
8+
UserId int,
9+
PostText text
10+
);

0 commit comments

Comments
 (0)