A technique I sometimes use (e.g. to filter faulty data, when none of the other wonderful capabilities of pandas.read_csv() seem to address the case at hand) is to define a subclass of io.TextIOWrapper.
In your case, you could write:
class SkipUntilMatchWrapper(io.TextIOWrapper):
    """Text wrapper that discards leading lines until `matcher` accepts one.

    Wraps a binary file object; on the first `read()`, lines are consumed
    and thrown away until `matcher(line)` returns True. From then on, reads
    pass through unchanged — handy for feeding `pandas.read_csv()` a file
    whose header is preceded by garbage lines.

    Parameters
    ----------
    f : binary file object
        The underlying (binary-mode) stream to wrap.
    matcher : callable(str) -> bool
        Predicate applied to each decoded line.
    include_matching : bool
        If True, the first matching line is returned (so it becomes the
        CSV header); if False it is discarded along with the garbage.
    """

    def __init__(self, f, matcher, include_matching=False):
        super().__init__(f, line_buffering=True)
        self.f = f
        self.matcher = matcher
        self.include_matching = include_matching
        self.has_matched = False

    def read(self, size=None):
        """Skip lines until the first match, then delegate to the normal read."""
        while not self.has_matched:
            line = self.readline()
            if not line:
                # EOF reached without any match: stop instead of spinning
                # forever on readline() returning '' (bug fix — the original
                # looped endlessly when the matcher never fired).
                self.has_matched = True
                return ''
            if self.matcher(line):
                self.has_matched = True
                if self.include_matching:
                    # Hand the matched line back; subsequent reads continue
                    # from just after it.
                    return line
        return super().read(size)
Let's try it on a simple example:
# make an example
# Build a sample file: a few garbage lines, then a CSV header and 5x3
# random integer rows appended by numpy.
with open('sample.csv', 'w') as out:
    header_lines = [
        'garbage 1',
        'garbage 2',
        'and now for some data',
        'a,b,c',
    ]
    out.write('\n'.join(header_lines) + '\n')
    sample = np.random.randint(0, 10, size=(5, 3))
    np.savetxt(out, sample, fmt='%d', delimiter=',')
Read:
# Keep the 'a,b,c' line so pandas uses it as the header.
with open('sample.csv', 'rb') as raw, \
        SkipUntilMatchWrapper(raw, lambda s: 'a,b,c' in s, include_matching=True) as wrapped:
    df = pd.read_csv(wrapped)
>>> df
a b c
0 2 7 8
1 7 3 3
2 3 6 9
3 0 6 0
4 4 0 9
Another way:
# Match the line just before the header; since include_matching defaults to
# False, everything up to and including that line is discarded and the next
# line ('a,b,c') becomes the header.
with open('sample.csv', 'rb') as raw, \
        SkipUntilMatchWrapper(raw, lambda s: 'for some data' in s) as wrapped:
    df = pd.read_csv(wrapped)
>>> df
a b c
0 2 7 8
1 7 3 3
2 3 6 9
3 0 6 0
4 4 0 9