Skip to content

Commit 7a3c09b

Browse files
committed
Split optional post processing tasks out.
1 parent ab33206 commit 7a3c09b

File tree

4 files changed

+72
-54
lines changed

4 files changed

+72
-54
lines changed

README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,15 @@ Schema hints are taken from [a post on Meta.StackExchange](http://meta.stackexch
2828
- `psql stackoverflow < ./sql/final_post.sql`
2929
- If you used a different database name, make sure to use that instead of
3030
`stackoverflow` while executing this step.
31+
- For some additional indexes and tables, you can also execute the following:
32+
- `psql stackoverflow < ./sql/optional_post.sql`
33+
- Again, remember to use the correct database name here, if not `stackoverflow`.
3134

3235
## Caveats and TODOs
3336

3437
- It prepares some indexes and views which may not be necessary for your analysis.
35-
- The `body` field in `Posts` table is NOT populated.
36-
- The `emailhash` field in `Users` table is NOT populated.
38+
- The `Body` field in `Posts` table is NOT populated.
39+
- The `EmailHash` field in `Users` table is NOT populated.
3740
- Some tables (e.g. `PostHistory` and `Comments`) are missing.
3841

3942
### Sept 2011 data dump

sql/Posts_post.sql

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,3 @@ CREATE INDEX posts_favorite_count_idx ON Posts USING btree (FavoriteCount)
1616
CREATE INDEX posts_viewcount_idx ON Posts USING btree (ViewCount)
1717
WITH (FILLFACTOR = 100);
1818

19-
-- Composite indexes (optional)
20-
CREATE INDEX posts_id_post_type_id_idx ON Posts USING btree (Id, PostTypeId)
21-
WITH (FILLFACTOR = 100);
22-
CREATE INDEX posts_id_parent_id_idx ON Posts USING btree (Id, ParentId)
23-
WITH (FILLFACTOR = 100);
24-
CREATE INDEX posts_id_accepted_answers_id_idx ON Posts USING btree (Id, AcceptedAnswerId)
25-
WITH (FILLFACTOR = 100);
26-
CREATE INDEX posts_owner_user_id_creation_date_idx ON Posts USING btree (OwnerUserId, CreationDate)
27-
WITH (FILLFACTOR = 100);

sql/final_post.sql

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -33,30 +33,6 @@ CREATE INDEX posttags_tagId_idx ON PostTags USING btree (TagId)
3333
WITH (FILLFACTOR = 100);
3434

3535

36-
-- UserTagQA TABLE
37-
DROP TABLE IF EXISTS UserTagQA;
38-
CREATE TABLE UserTagQA (
39-
UserId int,
40-
TagId int,
41-
Questions int,
42-
Answers int,
43-
PRIMARY KEY (UserId, TagId)
44-
);
45-
INSERT INTO UserTagQA
46-
( SELECT P.ownerUserId AS UserId,
47-
PT.tagId AS TagId,
48-
sum(CASE P.PostTypeId WHEN 1 THEN 1 ELSE 0 END) AS Questions,
49-
sum(CASE P.PostTypeId WHEN 2 THEN 1 ELSE 0 END) AS Answers
50-
FROM Posts P JOIN PostTags PT ON PT.PostId = P.Id
51-
WHERE P.OwnerUserId IS NOT NULL
52-
GROUP BY P.OwnerUserId, PT.TagId
53-
);
54-
CREATE INDEX usertagqa_questions_idx ON UserTagQA USING btree (Questions)
55-
WITH (FILLFACTOR = 100);
56-
CREATE INDEX usertagqa_answers_idx ON UserTagQA USING btree (Answers)
57-
WITH (FILLFACTOR = 100);
58-
59-
6036
-- Tables containing static values
6137

6238
-- CloseAsOffTopicReasonTypes TABLE
@@ -250,23 +226,4 @@ INSERT INTO PostLinkTypes VALUES
250226
( 1, 'Linked' ),
251227
( 3, 'Duplicate' );
252228

253-
-- Questions VIEW
254-
DROP VIEW IF EXISTS Questions;
255-
CREATE VIEW Questions AS
256-
SELECT Id, AcceptedAnswerId, CreationDate, Score, ViewCount, OwnerUserId,
257-
LastEditorUserId, LastEditorDisplayName, LastEditDate,
258-
LastActivityDate, Title, Tags, AnswerCount, CommentCount,
259-
FavoriteCount, CommunityOwnedDate
260-
FROM Posts
261-
WHERE PostTypeId = 1;
262-
263-
-- Answers VIEW
264-
DROP VIEW IF EXISTS Answers;
265-
CREATE VIEW Answers AS
266-
SELECT Id, ParentId, CreationDate, Score, OwnerUserId, LastEditorUserId,
267-
LastEditorDisplayName, LastEditDate, LastActivityDate,
268-
CommentCount, CommunityOwnedDate
269-
FROM Posts
270-
WHERE PostTypeId = 2;
271-
272229

sql/optional_post.sql

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
-- Optional post-processing tasks: extra derived tables, views, and composite indexes.
2+
3+
-- UserTagQA TABLE: per (user, tag) counts of posts with PostTypeId 1 (Questions) and 2 (Answers).
4+
DROP TABLE IF EXISTS UserTagQA;
5+
CREATE TABLE UserTagQA (
6+
UserId int,
7+
TagId int,
8+
Questions int,
9+
Answers int,
10+
PRIMARY KEY (UserId, TagId)
11+
);
12+
INSERT INTO UserTagQA (UserId, TagId, Questions, Answers) -- explicit column list: do not rely on column order
13+
( SELECT P.OwnerUserId AS UserId,
14+
PT.TagId AS TagId,
15+
sum(CASE P.PostTypeId WHEN 1 THEN 1 ELSE 0 END) AS Questions,
16+
sum(CASE P.PostTypeId WHEN 2 THEN 1 ELSE 0 END) AS Answers
17+
FROM Posts P INNER JOIN PostTags PT ON PT.PostId = P.Id
18+
WHERE P.OwnerUserId IS NOT NULL -- UserId is part of the primary key and cannot be NULL
19+
GROUP BY P.OwnerUserId, PT.TagId
20+
);
21+
CREATE INDEX usertagqa_questions_idx ON UserTagQA USING btree (Questions)
22+
WITH (FILLFACTOR = 100);
23+
CREATE INDEX usertagqa_answers_idx ON UserTagQA USING btree (Answers)
24+
WITH (FILLFACTOR = 100);
25+
26+
27+
-- QuestionAnswer TABLE: one row per answer post (PostTypeId = 2), pairing it with its ParentId.
28+
DROP TABLE IF EXISTS QuestionAnswer;
29+
CREATE TABLE QuestionAnswer (
30+
QuestionId int,
31+
AnswerId int,
32+
PRIMARY KEY (QuestionId, AnswerId)
33+
);
34+
INSERT INTO QuestionAnswer (QuestionId, AnswerId) -- explicit column list: do not rely on column order
35+
( SELECT P.ParentId AS QuestionId, P.Id AS AnswerId
36+
FROM Posts P WHERE P.PostTypeId = 2 AND P.ParentId IS NOT NULL -- primary key columns cannot be NULL; one orphaned answer would abort the whole insert
37+
);
38+
39+
-- Questions VIEW: the rows of Posts with PostTypeId = 1, restricted to the columns selected below.
40+
DROP VIEW IF EXISTS Questions;
41+
CREATE VIEW Questions AS
42+
SELECT Id, AcceptedAnswerId, CreationDate, Score, ViewCount, OwnerUserId,
43+
LastEditorUserId, LastEditorDisplayName, LastEditDate,
44+
LastActivityDate, Title, Tags, AnswerCount, CommentCount,
45+
FavoriteCount, CommunityOwnedDate
46+
FROM Posts
47+
WHERE PostTypeId = 1;
48+
49+
-- Answers VIEW: the rows of Posts with PostTypeId = 2, restricted to the columns selected below (ParentId is the post being answered).
50+
DROP VIEW IF EXISTS Answers;
51+
CREATE VIEW Answers AS
52+
SELECT Id, ParentId, CreationDate, Score, OwnerUserId, LastEditorUserId,
53+
LastEditorDisplayName, LastEditDate, LastActivityDate,
54+
CommentCount, CommunityOwnedDate
55+
FROM Posts
56+
WHERE PostTypeId = 2;
57+
58+
59+
-- Composite indexes for Posts table.
-- Each index is dropped first so this script can be re-run, and so it also works on a
-- database where an earlier Posts_post.sql already created an index with the same name.
60+
DROP INDEX IF EXISTS posts_id_post_type_id_idx;
CREATE INDEX posts_id_post_type_id_idx ON Posts USING btree (Id, PostTypeId)
61+
WITH (FILLFACTOR = 100);
62+
DROP INDEX IF EXISTS posts_id_parent_id_idx;
CREATE INDEX posts_id_parent_id_idx ON Posts USING btree (Id, ParentId)
63+
WITH (FILLFACTOR = 100);
64+
DROP INDEX IF EXISTS posts_id_accepted_answers_id_idx;
CREATE INDEX posts_id_accepted_answers_id_idx ON Posts USING btree (Id, AcceptedAnswerId)
65+
WITH (FILLFACTOR = 100);
66+
DROP INDEX IF EXISTS posts_owner_user_id_creation_date_idx;
CREATE INDEX posts_owner_user_id_creation_date_idx ON Posts USING btree (OwnerUserId, CreationDate)
67+
WITH (FILLFACTOR = 100);

0 commit comments

Comments
 (0)