3

I am trying to get some data out of a text file with the following format:

jvm: 2011-08-29 17:09:54.438864:
    MemoryStatistics: [290328680, 381288448]

moniData: 2011-08-29 17:09:54.438864:
    Depth: [0]
    RecordsSent: [1]

rdoutData: 2011-08-29 17:09:54.438864:
    Depth: [0]
    RecordsSent: [0]

rdoutReq: 2011-08-29 17:09:54.438864:
    TotalRecordsReceived: 132
    RecordsReceived: [132]
    BytesReceived: [8184]

sender: 2011-08-29 17:09:54.438864:
    NumReadoutRequestsReceived: 178
    NumHitsReceived: 2663
    NumReadoutsSent: 1
    NumHitsCached: 0
    NumHitsQueued: 310
    NumReadoutRequestsQueued: 0

snData: 2011-08-29 17:09:54.438864:
    Depth: [0]
    RecordsSent: [61]

stringHit: 2011-08-29 17:09:54.438864:
    Depth: [8]
    RecordsSent: [3026]

stringhub: 2011-08-29 17:09:54.438864:
    TimeOfLastHitOutputFromHKN1: 207977962295545677
    NumberOfActiveAndTotalChannels: [60, 60]
    NumberOfActiveChannels: 60
    TimeOfLastHitInputToHKN1: 207977964479700660
    HitRateLC: 0.0
    HitRate: 0.0
    TotalLBMOverflows: 1091

system: 2011-08-29 17:09:54.438864:
    LoadAverage: [0.0, 0.02, 0.35999999999999999]
    NetworkIO: {'lo_tx_errs': 0, 'eth1_rx_fifo': 0, 'eth2_rx_packets': 0, 'eth1_tx_compressed': 0, 'eth2_tx_compressed': 0, 'eth0_tx_fifo': 0, 'eth1_tx_packets': 0, 'lo_tx_compressed': 0, 'eth1_rx_compressed': 0, 'lo_rx_errs': 0, 'eth1_tx_fifo': 0, 'lo_tx_fifo': 0, 'eth0_tx_errs': 0, 'eth0_rx_multicast': 0, 'eth0_tx_carrier': 0, 'eth3_rx_compressed': 0, 'eth3_tx_drop': 0, 'lo_tx_drop': 0, 'eth2_rx_drop': 0, 'eth1_tx_drop': 0, 'eth3_rx_bytes': 0, 'eth3_tx_packets': 0, 'lo_rx_bytes': 8270472, 'eth2_rx_errs': 0, 'eth3_tx_errs': 0, 'eth0_rx_errs': 0, 'eth2_tx_errs': 0, 'lo_rx_packets': 71359, 'eth2_rx_compressed': 0, 'eth3_rx_packets': 0, 'eth0_tx_drop': 0, 'eth0_rx_frame': 0, 'eth1_tx_bytes': 0, 'eth1_rx_multicast': 0, 'eth1_rx_packets': 0, 'eth2_tx_fifo': 0, 'eth1_tx_errs': 0, 'eth2_tx_bytes': 0, 'eth3_rx_frame': 0, 'eth2_rx_frame': 0, 'eth1_rx_bytes': 0, 'eth0_rx_drop': 0, 'eth3_rx_drop': 0, 'eth1_rx_frame': 0, 'eth2_tx_packets': 0, 'eth0_tx_bytes': 389183382674, 'eth3_rx_errs': 0, 'eth0_rx_bytes': 141781372747, 'eth3_tx_compressed': 0, 'eth2_rx_fifo': 0, 'lo_tx_bytes': 8270472, 'eth1_rx_errs': 0, 'eth1_tx_carrier': 0, 'eth0_rx_packets': 478007025, 'lo_rx_drop': 0, 'eth0_tx_compressed': 0, 'eth0_rx_fifo': 0, 'eth3_tx_colls': 0, 'eth0_tx_colls': 0, 'lo_tx_packets': 71359, 'eth2_rx_multicast': 0, 'eth2_tx_colls': 0, 'eth3_tx_fifo': 0, 'eth1_tx_colls': 0, 'lo_tx_carrier': 0, 'lo_rx_frame': 0, 'eth1_rx_drop': 0, 'lo_tx_colls': 0, 'eth3_tx_bytes': 0, 'lo_rx_fifo': 0, 'eth2_tx_drop': 0, 'eth3_tx_carrier': 0, 'eth3_rx_multicast': 0, 'eth0_rx_compressed': 0, 'eth2_rx_bytes': 0, 'eth2_tx_carrier': 0, 'eth0_tx_packets': 1197286889, 'lo_rx_multicast': 0, 'lo_rx_compressed': 0, 'eth3_rx_fifo': 0}
    AvailableDiskSpace: {'/': 43836096, '/dev/shm': 24725760}

tcalData: 2011-08-29 17:09:54.438864:
    Depth: [0]
    RecordsSent: [0]

PyrateBufferManager: 2011-08-29 17:09:57.031479:
    CurrentAquiredBuffers: 0
    ReturnBufferCount: 4285
    CurrentAquiredBytes: 0

I am trying to get a plot that is time versus the various quantities stored in the file, so basically I would like to get out a couple arrays that are similar to this format:

timestamp=[...,17:09:54.438864,...]
snDataDepth=[..,0,...]
snDataRecordsSend=[..., 61,...]

I got recommendations by other people to use something like

# Read all lines up front; the context manager closes the file again.
with open(file, "r") as handle:
    f = handle.readlines()

# Keep the text between the first "[" and the following "]" of each line.
# Record headers, blank lines and scalar values such as "NumHitsCached: 0"
# contain no "[", so indexing [1] on their split result would raise
# IndexError -- those lines are skipped.
dummy = [i.split("[")[1].split("]")[0] for i in f if "[" in i]

to get the numbers out. I am having trouble getting the data sorted properly using the categories in the text (see above) and the timestamp.

Thanks in advance for any help

As requested:

Already used that to get some other data out:

#!/usr/bin/env python
import sys, os, re
import numpy as np
import pylab as py

def main():
    """Scan the files named on the command line for monitoring values.

    Every argument after the script name is handled according to its
    extension:

    * extension containing ``log``: all ``snRate`` values found in the file
      are collected, one list per file, in ``snrate``.
    * extension containing ``moni``: the time-of-day stamps are collected,
      de-duplicated and sorted, and for every ``PyrateBufferManager`` record
      the values of ``CurrentAquiredBuffers``, ``ReturnBufferCount`` and
      ``CurrentAquiredBytes`` are appended (as strings) to their lists.

    The results are only printed; nothing is returned.
    """
    snrate = []
    PyrateBufferManagerCABu = []
    PyrateBufferManagerRBC = []
    PyrateBufferManagerCABy = []
    for path in sys.argv[1:]:
        base, ext = os.path.splitext(path)
        print(base)
        if 'log' in ext:
            rates = []
            with open(path, 'r') as f:
                for line in f:
                    rates += re.findall(r'snRate: (\d.?\d+)', line)
            snrate.append(rates)
            print(ext)
        if 'moni' in ext:
            with open(path, 'r') as f:
                lines = f.readlines()
            timestamp = []
            for idx, line in enumerate(lines):
                # Full HH:MM:SS.ffffff stamp; a shorter match would merge
                # distinct stamps once duplicates are removed below.
                timestamp += re.findall(r'\d\d:\d\d:\d\d\.\d+', line)
                if 'PyrateBufferManager' in line:
                    # The values live on the lines FOLLOWING the header, so
                    # look at the next three lines of the file (the slice is
                    # safe at end of file) rather than at the header itself.
                    record = ''.join(lines[idx + 1:idx + 4])
                    PyrateBufferManagerCABu += re.findall(
                        r'CurrentAquiredBuffers:\s(\d+)', record)
                    print(PyrateBufferManagerCABu)
                    PyrateBufferManagerRBC += re.findall(
                        r'ReturnBufferCount:\s(\d+)', record)
                    PyrateBufferManagerCABy += re.findall(
                        r'CurrentAquiredBytes:\s(\d+)', record)
            timestamp = sorted(set(timestamp))
            # Single formatted argument so the output is the same whether
            # print is a statement (Python 2) or a function (Python 3).
            print("%s %s" % (PyrateBufferManagerCABu, PyrateBufferManagerRBC))
            print(ext)

4 Answers 4

1

You can use python's builtin lib re for regular expressions. To get all of the timestamps you can do something like this:

import re

def main(path=None):
    """Print every ``YYYY-MM-DD HH:MM:SS.ffffff:`` stamp found in a file.

    path -- file to scan; when omitted, the first command-line argument
            is used.

    The matched strings keep the trailing colon of the record header.
    """
    if path is None:
        import sys
        path = sys.argv[1]
    # Use a separate name for the handle: assigning to the name that is also
    # passed to open() makes it a local that is read before it is set.
    with open(path) as handle:
        text = handle.read()
    timestamp = re.findall(r'\S+-\S+-\d\d\s\d\d:\S+:\S+:', text)
    print(timestamp)

if __name__=='__main__':
    main()
Sign up to request clarification or add additional context in comments.

2 Comments

Already used that to get some other data out: #!/usr/bin/env python import sys, os, re import numpy as np import pylab as py snrate=[] for i in range(1,len(sys.argv)): print i f=open(sys.argv[i], 'rU') dummy=[] dummy1=[] if f: for line in f: dummy += re.findall('snRate: (\d.?\d+)', line) dummy1 += re.findall('Buffer overflow in SN record channel:', line) print len(dummy) print len(dummy1) snrate.append(dummy)
@madtowneast I find it ironic that it is impossible to post a legal Python program in the comment field here due to the whitespace inanity.
1

Why not simplicity ? :

import re

# One snData record: the header with its timestamp, followed directly by the
# Depth and RecordsSent lines that belong to it.  Only whitespace may separate
# the three parts, so a match can never run on into the Depth/RecordsSent
# lines of a later record (a greedy '.+' under re.DOTALL skips ahead to the
# last such lines in the file).
regx = re.compile(r'snData: +(\d{4}-\d\d-\d\d +\d\d:\d\d:\d\d\.\d+):\s+'
                  r'Depth: +\[(\d+)\]\s+'
                  r'RecordsSent: +\[(\d+)\]')

timestamp, snDataDepth, snDataRecordsSend = [], [], []

with open('data_for_plot.txt') as f:
    ch = f.read()

# Collect every snData record in the file, not just the first one; the lists
# simply stay empty when the file holds none.  For the snData record shown in
# the question this yields Depth '0' and RecordsSent '61'.
for a, b, c in regx.findall(ch):
    timestamp.append(a)
    snDataDepth.append(b)
    snDataRecordsSend.append(c)

print(timestamp)
print(snDataDepth)
print(snDataRecordsSend)

result

['2011-08-29 17:09:54.438864']
['0']
['0']

Comments

1

You could parse your input data using lepl:

#!/usr/bin/env python
import ast
import fileinput
import logging

from datetime import datetime
from pprint   import pprint
from string   import ascii_letters, digits

from lepl import Any, Iterate, Newline, Regexp, SkipTo, Space

# Grammar for the monitoring file, built from lepl matchers.
# NOTE(review): the remarks on lepl operators below (`~` dropping a match,
# `>` / `>>` applying a function) are inferred from the output this script
# produces -- confirm against the lepl documentation.

# ABNF: property = name colon python_literal
name = Any(ascii_letters+digits)[1:,...] # [A-Za-z0-9]+ (like \w+ but without '_')
colon = Space()[:,...] & ':' & Space()[:,...] # \s*:\s*
# the rest of the line is turned into a Python value by ast.literal_eval
python_literal = Regexp(r'.+') >> ast.literal_eval
# (name, value) pair; the colon itself is presumably discarded by `~`
property_ = name & ~colon & python_literal > tuple

# record consists of name, timestamp and one or more properties
#   ABNF: record = name colon timestamp colon 1*( NEWLINE indent property )
# the timestamp text ends at the last character that is neither whitespace
# nor ':' and is converted to a datetime object
timestamp = Regexp(r'.*[^\s:]') >> (lambda s: (
    datetime.strptime(s, "%Y-%m-%d %H:%M:%S.%f")))
# the indented property pairs are gathered into a dict, the whole record
# into a (name, timestamp, properties) tuple
record = (name & ~colon & timestamp & ~colon &
          ((~Newline() & ~Space()[1:,...] & property_)[1:] > dict)) > tuple

# file consists of one or more records interlaced with newlines
#   ABNF: file = 1*( NEWLINE | record )
#     skip unrecognized text up to the next newline and report it via
#     logging.error
unknown = SkipTo(Newline()) > (lambda s: logging.error('unknown: %r' % (s,)))
it = Iterate(record | ~Newline() | ~unknown) # consume input one record at a time
it.config.no_full_first_match().no_memoize() # improve performance

iterparse = it.get_parse_file_all() # output one record at a time
# fileinput.input() reads the files named on the command line (or stdin);
# empty results are filtered out and the first item of each remaining result
# is pretty-printed
pprint([lst[0] for lst in iterparse(fileinput.input()) if lst])

Output

[('jvm',
  datetime.datetime(2011, 8, 29, 17, 9, 54, 438864),
  {'MemoryStatistics': [290328680, 381288448]}),
 ('moniData',
  datetime.datetime(2011, 8, 29, 17, 9, 54, 438864),
  {'Depth': [0], 'RecordsSent': [1]}),
 ('rdoutData',
  datetime.datetime(2011, 8, 29, 17, 9, 54, 438864),
  {'Depth': [0], 'RecordsSent': [0]}),
 ('rdoutReq',
  datetime.datetime(2011, 8, 29, 17, 9, 54, 438864),
  {'BytesReceived': [8184],
   'RecordsReceived': [132],
   'TotalRecordsReceived': 132}),
 ('sender',
  datetime.datetime(2011, 8, 29, 17, 9, 54, 438864),
  {'NumHitsCached': 0,
   'NumHitsQueued': 310,
   'NumHitsReceived': 2663,
   'NumReadoutRequestsQueued': 0,
   'NumReadoutRequestsReceived': 178,
   'NumReadoutsSent': 1}),
 ('snData',
  datetime.datetime(2011, 8, 29, 17, 9, 54, 438864),
  {'Depth': [0], 'RecordsSent': [61]}),
 # ... snip ...
 ('PyrateBufferManager',
  datetime.datetime(2011, 8, 29, 17, 9, 57, 31479),
  {'CurrentAquiredBuffers': 0,
   'CurrentAquiredBytes': 0,
   'ReturnBufferCount': 4285})]

7 Comments

That looks really good. I have a question though: what is the output? I see a lot of tuples and lists within a list. Wouldn't a dictionary within a dictionary work too?
@madtowneast: yes. dict works too (group a record into 2-item tuple: I don't know what you prefer: (name, timestamp), properties or name, (timestamp, properties)). You could use collections.OrderedDict() to preserve records order (the same goes for properties).
I was thinking about {timestamp: {name: {properties:}}} I will try adjust your code that way.
Would it just be s=open(file, 'rU').readlines()? I am just getting confused about how to go through the file.
@madtowneast: use .parse_file()
|
0

You could use yaml to parse your file if you transform it slightly:

#!/usr/bin/env python
import datetime
import re

import yaml

# read the whole input; the context manager closes the file again
with open('input') as f:
    text = f.read()

# transform text to make it a valid yaml: the timestamp that follows each
# record name is moved onto its own line, indented by one space, so it
# becomes a key nested under the record name
re_name = re.compile(r'^(\w+\:)\s', re.MULTILINE)
yaml_text = re_name.sub(r'\1\n ', text)

# safe_load only builds plain Python objects (dicts, lists, numbers,
# datetimes), which is all this data needs; a bare yaml.load() without a
# Loader argument is rejected by current PyYAML releases and must not be
# used on files you do not trust
obj = yaml.safe_load(yaml_text)
d = obj['sender'][datetime.datetime(2011, 8, 29, 17, 9, 54, 438864)]
print('number of hits: {NumHitsQueued}'.format(**d))

Output

number of hits: 310

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.