-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmbox-filter.py
executable file
·309 lines (255 loc) · 9.26 KB
/
mbox-filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
#!/usr/bin/env python
"""
/*****************************************************************
* copyright (c) 2010, Michael D. Day
*
* This work is licensed under the GNU GPL, version 2. See
* http://www.gnu.org/licenses/gpl-2.0.txt
*
****************************************************************/
This is an mbox filter. It scans through an entire mbox style mailbox
and writes the messages to a new file. Each message is passed
through a filter function which may modify the document or ignore it.
The passthrough_filter() example below simply prints the 'from' email
address and returns the document unchanged. After running this script
the input mailbox and output mailbox should be identical.
"""
import mailbox, rfc822
import sys, os, string, re, datetime
from optparse import OptionParser
# Line feed written after a message's header lines (see write_message).
LF = '\x0a'
# Placeholder for the command-line options; main() rebinds this global to
# the values object returned by OptionParser.parse_args().
options = {}
def main ():
    """Parse the command line and run the selected filter over the mailbox.

    The parsed options are stored in the module-level ``options`` global,
    which the filter functions and process_mailbox() read.  When --file is
    missing the help text is printed and the process exits with status -1.

    The --filter value is matched case-insensitively by substring, in the
    order date, header, regexp.  Any other value falls back to
    passthrough_filter.
    """
    global options
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("--file", dest="mbox_in",
                      help="mbox file to read (required)")
    parser.add_option("--out", dest="mbox_out", help="output mbox")
    parser.add_option("--filter", dest="filter", default="date",
                      help="filter type (date, header, regexp)")
    parser.add_option("--exp", dest="exp",
                      help="filter expression (depends on filter type)")
    parser.add_option("--op", dest="op", default="=", help="filter operand")
    parser.add_option("--invert", dest="mbox_invert",
                      help="invert filter and output to a secondary file")
    parser.add_option("--verbose", dest="verbose", action="store_true", default=False)
    parser.add_option("--concise", dest="concise", action="store_true", default=False)
    # Positional arguments are accepted but not used.
    (options, _unused_args) = parser.parse_args()
    if options.mbox_in is None:
        parser.print_help()
        sys.exit(-1)

    # str.lower() replaces the deprecated string.lower() function.
    requested = options.filter.lower()
    # Order matters: the first keyword contained in the requested type wins.
    filters = (
        ("date", date_filter),
        ("header", header_filter),
        ("regexp", regexp_filter),
    )
    for keyword, filter_function in filters:
        if keyword in requested:
            return process_mailbox(filter_function)
    # Unrecognised filter type: copy every message through unchanged.
    return process_mailbox(passthrough_filter)
#datetime.datetime(year, month, day[, hour[, minute[, second[, microsecond[, tzinfo]]]]])
def convertStr(s):
    """Convert string to either int or float.

    An int is tried first, then a float.  Returns 0 when the string is
    neither.  Only ValueError is treated as "not a number", so unrelated
    errors (and KeyboardInterrupt) are no longer silently swallowed.
    """
    for convert in (int, float):
        try:
            return convert(s)
        except ValueError:
            continue
    return 0
# Month abbreviations, keyed in lower case so lookups ignore capitalisation.
_MONTH_NUMBERS = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
                  'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}


def _date_token_to_int(token):
    """Return token as an int, or 0 when it is not an integer string."""
    try:
        return int(token)
    except (TypeError, ValueError):
        return 0


def _expand_short_year(year):
    """Expand a 2- or 3-digit year using the RFC 5322 obsolete-year rule."""
    if year < 50:
        return year + 2000
    if year < 1000:
        return year + 1900
    return year


def get_datetime(msg_date):
    """Parse the date part of a message 'Date' header.

    The string is split on non-word characters and the first six tokens are
    scanned for a day number, a month abbreviation and a year.  The leading
    day of the week is optional.  Time of day and time zone are ignored, so
    the result is a naive datetime at midnight.

    Formats handled:
        Sat, 6 Oct 2007 04:39:36 -0700 (PDT)
        6 Oct 2007 04:39:36 -0700
        2007 12 02                 (year month day, all numeric)
        Sun, 19 7 09               (TortoiseGit: day month two-digit-year)

    Returns datetime.datetime.today() when the string is empty, has fewer
    than three tokens, or cannot be parsed.  In the last case an error
    message and the offending string are printed.
    """
    if not msg_date:
        return datetime.datetime.today()
    date_strings = re.split(r"\W", msg_date)
    if len(date_strings) < 3:
        return datetime.datetime.today()

    day = 0
    month = 0
    year = 0
    # mailers will use a date string like the following, or perhaps
    # without the leading day of week
    # Sat, 6 Oct 2007 04:39:36 -0700 (PDT)
    for token in date_strings[:6]:
        if token.isdigit():
            if day == 0:
                day = int(token)
            else:
                # The second number is the year; stop before the time fields.
                # Some git mailer scripts use a two digit year "09".
                year = _expand_short_year(int(token))
                break
        else:
            # Weekday names and empty tokens simply fail this lookup.
            month = _MONTH_NUMBERS.get(token.lower(), month)

    # some mailers use weird date strings like the following:
    # 2007 12 02
    if month == 0:
        year = _date_token_to_int(date_strings[0])
        month = _date_token_to_int(date_strings[1])
        day = _date_token_to_int(date_strings[2])
    try:
        return datetime.datetime(year, month, day)
    except (ValueError, OverflowError):
        pass

    # some mailers use a format like the following
    # Sun, 19 7 09 (TortoiseGit)
    # Only attempt it when tokens 2-4 actually exist.
    if len(date_strings) > 4:
        year = _expand_short_year(_date_token_to_int(date_strings[4]))
        month = _date_token_to_int(date_strings[3])
        day = _date_token_to_int(date_strings[2])
        try:
            return datetime.datetime(year, month, day)
        except (ValueError, OverflowError):
            pass

    print("error parsing time stamp for message %d %d %d" % (year, month, day))
    print(msg_date)
    return datetime.datetime.today()
# Example Date header: Thu, 4 Oct 2007 16:56:06 +0100 (BST)
# filter - date
# exp - comparison operator: <, > or =
# op - date string to compare against (mm-dd-yyyy)
def date_filter (msg, document):
    """Keep the message when its date compares as requested to the filter date.

    options.op holds the filter date as mm-dd-yyyy (or mm/dd/yyyy).  When it
    is None or cannot be parsed, today's date is used; in the unparseable
    case an error message is printed first.

    options.exp holds the comparison as any combination of the characters
    '<', '=' and '>'.  The message matches when at least one selected
    relation holds, so "<=" means on or before and ">=" means on or after.
    A missing expression matches nothing.

    A message without a Date header is compared as if dated today.

    Returns document on a match, otherwise None.  With options.verbose set,
    the Date header of each matching message is printed.
    """
    # get the message date
    try:
        msg_date = msg['Date']
    except KeyError:
        msg_date = None
    if msg_date is None:
        msg_datetime = datetime.datetime.today()
    else:
        msg_datetime = get_datetime(msg_date)

    # get the filter date
    if options.op is None:
        filter_datetime = datetime.datetime.today()
    else:
        # mm-dd-yyyy
        date_strings = re.split('[-/]', options.op)
        try:
            # Unpacking raises ValueError when fewer than three parts exist.
            month, day, year = [convertStr(part) for part in date_strings[:3]]
            filter_datetime = datetime.datetime(year, month, day)
        except (TypeError, ValueError, OverflowError):
            print("error parsing date filter expression")
            filter_datetime = datetime.datetime.today()

    # now we need to evaluate the filter expression against the
    # message date header
    # msg_datetime exp filter_datetime
    expression = options.exp or ""
    matched = (
        ("<" in expression and msg_datetime < filter_datetime)
        or ("=" in expression and msg_datetime == filter_datetime)
        or (">" in expression and msg_datetime > filter_datetime)
    )
    if not matched:
        return None
    if options.verbose is True and document is not None:
        print(msg_date)
    return document
def passthrough_filter (msg, document):
    """Echo the sender's address and hand the document back untouched.

    The address printed is the second element of msg.getaddr('From').
    """
    sender = msg.getaddr('From')
    print(sender[1])
    return document
# filter - header
# exp - regexp to filter header contents
# op - name of header
def header_filter (msg, document):
    """Keep the message when a named header matches a regular expression.

    options.op is the header name and options.exp is the pattern, which is
    searched case-insensitively in the header value.

    Returns document on a match, otherwise None.  A message that lacks the
    header never matches.  When the pattern is missing or invalid the header
    value is printed and None is returned.  With options.verbose set, the
    header value of each matching message is printed.
    """
    try:
        msg_header = msg[options.op]
    except (KeyError, TypeError, AttributeError):
        # Header absent (or unusable header name): nothing to match against.
        # The original printed the still-unbound msg_header here and crashed.
        return None
    if msg_header is None:
        return None

    try:
        regxp = re.compile(options.exp, re.I)
    except (re.error, TypeError):
        print(msg_header)
        return None

    if regxp.search(msg_header) is None:
        return None
    if options.verbose is True:
        print(msg_header)
    return document
# filter - regexp
# exp - regexp to filter document contents
# op - unused
def regexp_filter (msg, document):
    """Keep the message when its document text matches a regular expression.

    options.exp is the pattern (case-sensitive); options.op is unused.

    Returns document on a match, otherwise None.  A missing or invalid
    pattern matches nothing.  With options.verbose set, the Subject header
    of each matching message is printed, or an empty line when the message
    has no Subject.
    """
    try:
        regxp = re.compile(options.exp)
    except (re.error, TypeError):
        # Invalid pattern, or no --exp given: no message can match.
        return None

    if regxp.search(document) is None:
        return None
    if options.verbose is True:
        try:
            subject = msg['Subject']
        except KeyError:
            subject = ""
        print(subject)
    return document
def process_mailbox (filter_function):
    """Run every message of the input mailbox through filter_function.

    filter_function(msg, document) returns None to reject a message; any
    other value accepts it.  Accepted messages are written to options.mbox_out
    (standard output when that is None).  Rejected messages are written to
    options.mbox_invert when it is set, and dropped otherwise.
    See passthrough_filter().

    The input file is opened before the outputs so that a missing input does
    not truncate an existing output file.  Every file opened here is closed
    even if filtering fails.  Standard output is flushed, not closed.
    """
    fin = None
    fout = None
    finvert = None
    try:
        fin = open(options.mbox_in, 'r')
        if options.mbox_out is None:
            fout = sys.stdout
        else:
            fout = open(options.mbox_out, 'w')
        if options.mbox_invert is not None:
            finvert = open(options.mbox_invert, 'w')

        # Open the mailbox.
        mb = mailbox.UnixMailbox(fin)
        msg = mb.next()
        while msg is not None:
            # Properties of msg cannot be modified, so we pull out the
            # document to handle it separately. We keep msg around to
            # keep track of headers and stuff.
            document = msg.fp.read()
            if filter_function(msg, document) is not None:
                write_message(fout, msg, document)
            elif finvert is not None:
                write_message(finvert, msg, document)
            msg = mb.next()
    finally:
        for handle in (fin, fout, finvert):
            if handle is None:
                continue
            if handle is sys.stdout:
                handle.flush()
            else:
                handle.close()
def write_message (fout, msg, document):
    """Append one message to fout in mbox format.

    Writes msg.unixfrom, then each entry of msg.headers, then a single
    linefeed.  The document follows unless options.concise is set.

    msg and document are expected to come from the 'mailbox' module, which
    delivers the document already terminated by two linefeeds.  A caller
    that modified the document must make sure it still ends that way.
    """
    fout.write(msg.unixfrom)
    for header_line in msg.headers:
        fout.write(header_line)
    fout.write(LF)
    if options.concise is not False:
        # Concise mode: headers only.
        return
    fout.write(document)
# Run the filter only when executed as a script, not when imported.
if __name__ == '__main__':
    main ()