Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 28 additions & 14 deletions load_into_pg.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import psycopg2 as pg
import row_processor as Processor
import six
import json

# Special rules needed for certain tables (esp. for old database dumps)
specialRules = {
Expand All @@ -15,17 +16,17 @@ def _makeDefValues(keys):
"""Returns a dictionary containing None for all keys."""
return dict(( (k, None) for k in keys ))

def _createMogrificationTemplate(table, keys):
def _createMogrificationTemplate(table, keys, insertJson):
"""Return the template string for mogrification for the given keys."""
return ( '(' +
', '.join( [ '%(' + k + ')s' if (table, k) not in specialRules else specialRules[table, k]
for k in keys
]
) +
')'
)

def _createCmdTuple(cursor, keys, templ, attribs):
table_keys = ', '.join( [ '%(' + k + ')s' if (table, k) not in specialRules
else specialRules[table, k]
for k in keys ])
if insertJson:
return ('(' + table_keys + ', %(jsonfield)s' + ')')
else:
return ('(' + table_keys + ')')

def _createCmdTuple(cursor, keys, templ, attribs, insertJson):
"""Use the cursor to mogrify a tuple of data.
The passed data in `attribs` is augmented with default data (NULLs) and the
order of data in the tuple is the same as in the list of `keys`. The
Expand All @@ -34,12 +35,20 @@ def _createCmdTuple(cursor, keys, templ, attribs):
"""
defs = _makeDefValues(keys)
defs.update(attribs)

if insertJson:
dict_attribs = { }
for name, value in attribs.items():
dict_attribs[name] = value
defs['jsonfield'] = json.dumps(dict_attribs)

values_to_insert = cursor.mogrify(templ, defs)
return cursor.mogrify(templ, defs)

def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword):
def handleTable(table, keys, insertJson, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPassword):
"""Handle the table including the post/pre processing."""
dbFile = mbDbFile if mbDbFile is not None else table + '.xml'
tmpl = _createMogrificationTemplate(table, keys)
tmpl = _createMogrificationTemplate(table, keys, insertJson)
start_time = time.time()

try:
Expand Down Expand Up @@ -82,7 +91,7 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
six.print_('Processing data ...')
for rows in Processor.batch(Processor.parse(xml), 500):
valuesStr = ',\n'.join(
[ _createCmdTuple(cur, keys, tmpl, row_attribs).decode('utf-8')
[ _createCmdTuple(cur, keys, tmpl, row_attribs, insertJson).decode('utf-8')
for row_attribs in rows
]
)
Expand Down Expand Up @@ -159,6 +168,11 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
, default = False
)

parser.add_argument( '-j', '--insert-json'
, help = 'Insert raw data as JSON.'
, action = 'store_true'
, default = False
)
args = parser.parse_args()

table = args.table
Expand Down Expand Up @@ -279,7 +293,7 @@ def handleTable(table, keys, dbname, mbDbFile, mbHost, mbPort, mbUsername, mbPas
choice = input('This will drop the {} table. Are you sure [y/n]?'.format(table))

if len(choice) > 0 and choice[0].lower() == 'y':
handleTable(table, keys, args.dbname, args.file, args.host, args.port, args.username, args.password)
handleTable(table, keys, args.insert_json, args.dbname, args.file, args.host, args.port, args.username, args.password)
else:
six.print_("Cancelled.")

3 changes: 2 additions & 1 deletion sql/Badges_pre.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@ CREATE TABLE Badges (
Id int PRIMARY KEY ,
UserId int not NULL ,
Name text not NULL ,
Date timestamp not NULL
Date timestamp not NULL ,
jsonfield jsonb
);
3 changes: 2 additions & 1 deletion sql/Comments_pre.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ CREATE TABLE Comments (
Score int not NULL ,
Text text ,
CreationDate timestamp not NULL ,
UserId int
UserId int ,
jsonfield jsonb
);
3 changes: 2 additions & 1 deletion sql/PostHistory_pre.sql
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ CREATE TABLE PostHistory (
RevisionGUID text ,
CreationDate timestamp not NULL ,
UserId int ,
PostText text
PostText text ,
jsonfield jsonb
);
3 changes: 2 additions & 1 deletion sql/PostLinks_pre.sql
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ CREATE TABLE PostLinks (
CreationDate timestamp not NUll ,
PostId int not NULL ,
RelatedPostId int not NULL ,
LinkTypeId int not Null
LinkTypeId int not Null ,
jsonfield jsonb
);
3 changes: 2 additions & 1 deletion sql/Posts_pre.sql
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ CREATE TABLE Posts (
CommentCount int ,
FavoriteCount int ,
ClosedDate timestamp ,
CommunityOwnedDate timestamp
CommunityOwnedDate timestamp ,
jsonfield jsonb
);

7 changes: 4 additions & 3 deletions sql/Tags_pre.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ DROP TABLE IF EXISTS Tags CASCADE;
CREATE TABLE Tags (
Id int PRIMARY KEY ,
TagName text not NULL ,
Count int,
ExcerptPostId int,
WikiPostId int
Count int ,
ExcerptPostId int ,
WikiPostId int ,
jsonfield jsonb
);
3 changes: 2 additions & 1 deletion sql/Users_pre.sql
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ CREATE TABLE Users (
DownVotes int not NULL ,
ProfileImageUrl text ,
Age int ,
AccountId int -- NULL accountId == deleted account?
AccountId int , -- NULL accountId == deleted account?
jsonfield jsonb
);

3 changes: 2 additions & 1 deletion sql/Votes_pre.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ CREATE TABLE Votes (
VoteTypeId int not NULL ,
UserId int ,
CreationDate timestamp not NULL ,
BountyAmount int
BountyAmount int ,
jsonfield jsonb
);