This commit is contained in:
Peter Molnar 2018-02-28 20:18:59 +00:00 committed by GitHub
commit de14c33321

363
logs2pidgin.py Normal file
View file

@ -0,0 +1,363 @@
import os
import sqlite3
import logging
import re
import glob
import sys
import hashlib
import arrow
import argparse
from pprint import pprint
def logfilename(dt, nulltime=False):
    """Build the file name of a converted log from its start time.

    The result has the shape ``<date>.<time><utc offset><tz name>.txt``.

    dt must offer ``format()`` and a ``datetime`` attribute (the callers in
    this file pass arrow objects). When nulltime is true, the time part is
    fixed to ``000000`` instead of being taken from dt.
    """
    clock = '000000' if nulltime else dt.format('HHmmss')
    stamp = dt.datetime
    pieces = (
        dt.format("YYYY-MM-DD"),
        clock,
        stamp.strftime("%z"),
        stamp.strftime("%Z"),
    )
    return "%s.%s%s%s.txt" % pieces
def logappend(fpath, dt, sender, msg):
    """Append one message line to a log file and backdate the file.

    Writes ``(<date time>) <sender>: <msg>`` to fpath, then sets the access
    and modification times of both the file and its parent directory to the
    moment described by dt.

    fpath  -- path of the log file to append to
    dt     -- message time; needs ``format()`` and ``timestamp`` (the
              callers in this file pass arrow objects)
    sender -- name written in front of the message
    msg    -- message text
    """
    logging.debug('appending log: %s', fpath)
    # Explicit UTF-8 so non-ASCII messages do not depend on the locale.
    with open(fpath, 'at', encoding='utf-8') as f:
        f.write("(%s) %s: %s\n" % (
            dt.format('YYYY-MM-DD HH:mm:ss'),
            sender,
            msg
        ))

    # arrow < 1.0 exposes ``timestamp`` as a property, arrow >= 1.0 as a
    # method; accept both so os.utime always receives a number.
    stamp = dt.timestamp
    if callable(stamp):
        stamp = stamp()
    os.utime(fpath, (stamp, stamp))
    os.utime(os.path.dirname(fpath), (stamp, stamp))
def logcreate(fpath, contact, dt, account, plugin):
    """Create a log file containing only the conversation header line.

    Nothing happens when fpath already exists, so this can be called before
    every message without clobbering lines written earlier.

    fpath   -- path of the log file
    contact -- name of the conversation partner
    dt      -- conversation start; needs ``format()`` (the callers in this
               file pass arrow objects)
    account -- name of the local account
    plugin  -- protocol name shown in parentheses at the end of the header
    """
    if os.path.exists(fpath):
        return

    logging.info('creating converted log: %s', fpath)
    with open(fpath, 'wt', encoding='utf-8') as f:
        f.write("Conversation with %s at %s on %s (%s)\n" % (
            contact,
            # 'DD' is the day-of-month token; the former 'dd' is not a
            # token arrow knows and rendered as an empty string.
            dt.format('ddd DD MMM YYYY hh:mm:ss A ZZZ'),
            account,
            plugin
        ))
def do_msnplus(msgpluslogs, logpathbase, msgplustz):
    """Convert the HTML logs found below msgpluslogs into text logs.

    Every ``*.html`` file under msgpluslogs (searched recursively) is read
    as UTF-16 and parsed. The name of the directory holding the file is
    used as the contact name; the account name comes from the first
    ``<li class="in">`` element. Each element with class ``mplsession``
    becomes one log file at
    ``<logpathbase>/msn/<account>/<contact>/<logfilename>``.

    msgpluslogs -- root directory of the HTML logs
    logpathbase -- root directory of the converted logs
    msgplustz   -- timezone applied to the parsed timestamps
    """
    from bs4 import BeautifulSoup

    NOPAR = re.compile(r'\((.*)\)')
    plugin = 'msn'
    searchin = os.path.join(msgpluslogs, '**', '*.html')

    for log in glob.glob(searchin, recursive=True):
        logging.info('converting log file: %s', log)
        contact = os.path.basename(os.path.dirname(log))
        with open(log, 'rt', encoding='UTF-16') as f:
            html = BeautifulSoup(f.read(), "html.parser")

        owner = html.find('li', attrs={'class': 'in'})
        if owner is None or owner.span is None or not owner.span.string:
            logging.warning(
                'no account name found, skipping log file: %s', log
            )
            continue
        # Replace a parenthesised part with its content.
        account = NOPAR.sub(r'\g<1>', owner.span.string)

        for session in html.find_all(attrs={'class': 'mplsession'}):
            dt = arrow.get(
                session.get('id').replace('Session_', ''),
                'YYYY-MM-DDTHH-mm-ss'
            )
            dt = dt.replace(tzinfo=msgplustz)
            seconds = int(dt.format('s'))
            fpath = os.path.join(
                logpathbase,
                plugin,
                account,
                contact,
                logfilename(dt)
            )
            os.makedirs(os.path.dirname(fpath), exist_ok=True)

            for line in session.find_all('tr'):
                # Rows only supply hour and minute below, so the seconds
                # are a running counter that starts at the session's
                # seconds and wraps after 59.
                seconds = 0 if seconds == 59 else seconds + 1

                # The time cell is detached from the row before the
                # message text is read.
                tspan = line.find(attrs={'class': 'time'})
                if tspan is None:
                    logging.debug('row without a time cell skipped: %s', log)
                    continue
                tspan = tspan.extract()
                if not tspan.string:
                    logging.debug('row with an empty time cell skipped: %s', log)
                    continue
                time = tspan.string.replace('(', '').replace(')', '')
                time = time.strip().split(':')

                head = line.find('th')
                sender = head.string if head is not None else None
                if not sender:
                    continue
                sender = sender.strip().split(':')[0]

                body = line.find('td')
                msg = body.get_text() if body is not None else ''

                mindt = dt.replace(
                    hour=int(time[0]),
                    minute=int(time[1]),
                    second=seconds
                )
                # The header is written here, not per session, so sessions
                # without a usable row leave no file behind.
                logcreate(fpath, contact, dt, account, plugin)
                logappend(fpath, mindt, sender, msg)
def do_trillian(trillianlogs, logpathbase, trilliantz, cleanup=False):
    """Convert the ``*.log`` files found below trillianlogs into text logs.

    Every ``*.log`` file under trillianlogs (searched recursively) is split
    into sessions and messages with regular expressions. Files whose path
    contains ``Channel`` are skipped as group conversations. The protocol
    name is the lower-cased name of the grandparent directory of the file,
    and the file name without ``.log`` is the contact directory name. Each
    session becomes one log file at
    ``<logpathbase>/<protocol>/<account>/<contact>/<logfilename>``.

    trillianlogs -- root directory of the source logs
    logpathbase  -- root directory of the converted logs
    trilliantz   -- timezone applied to the parsed timestamps
    cleanup      -- when true, delete each source log after converting it
    """
    # NOTE(review): a session ends at the next occurrence of the word
    # "Session", so message text containing that word would cut the
    # session short -- confirm against real logs.
    SPLIT_SESSIONS = re.compile(
        r'^Session Start\s+\((?P<participants>.*)?\):\s+(?P<timestamp>[^\n]+)'
        r'\n(?P<session>(?:.|\n)*?)(?=Session)',
        re.MULTILINE
    )
    SPLIT_MESSAGES = re.compile(
        r'\[(?P<time>[^\]]+)\]\s+(?P<sender>.*?):\s+'
        r'(?P<msg>(?:.|\n)*?)(?=\n\[|$)'
    )
    searchin = os.path.join(trillianlogs, '**', '*.log')

    for log in glob.glob(searchin, recursive=True):
        if 'Channel' in log:
            logging.warning(
                "Group conversations are not supported yet, skipping %s", log
            )
            continue

        logging.info('converting log file: %s', log)
        contact = os.path.basename(log).replace('.log', '')
        plugin = os.path.basename(
            os.path.dirname(os.path.dirname(log))
        ).lower()

        # Try UTF-8 first; fall back to ISO-8859-1 when decoding fails.
        try:
            with open(log, 'rt', encoding='utf-8') as f:
                c = f.read()
        except UnicodeDecodeError:
            with open(log, 'rt', encoding="ISO-8859-1") as f:
                c = f.read()

        for participants, timestamp, session in SPLIT_SESSIONS.findall(c):
            logging.debug('converting session starting at: %s', timestamp)
            participants = participants.split(':')
            account = participants[0]
            # The part after the first colon of the participants string
            # names the peer in the header; without one, use the name
            # derived from the file.
            peer = participants[1] if len(participants) > 1 else contact

            dt = arrow.get(timestamp, 'ddd MMM DD HH:mm:ss YYYY')
            dt = dt.replace(tzinfo=trilliantz)
            fpath = os.path.join(
                logpathbase,
                plugin,
                account,
                contact,
                logfilename(dt)
            )
            os.makedirs(os.path.dirname(fpath), exist_ok=True)

            seconds = int(dt.format('s'))
            for time, sender, msg in SPLIT_MESSAGES.findall(session):
                # this is a fix for ancient trillian logs where seconds
                # were missing
                seconds = 0 if seconds == 59 else seconds + 1

                try:
                    mindt = arrow.get(time, 'YYYY.MM.DD HH:mm:ss')
                except (arrow.parser.ParserError, ValueError):
                    # Not a full date: take hour and minute from the
                    # message and the seconds from the running counter.
                    parts = time.split(':')
                    mindt = dt.replace(
                        hour=int(parts[0]),
                        minute=int(parts[1]),
                        second=seconds
                    )
                else:
                    # Put fully dated timestamps into the same timezone
                    # as the session start.
                    mindt = mindt.replace(tzinfo=trilliantz)

                # creating the file with the header has to be here to
                # avoid empty or status-messages only files
                logcreate(fpath, peer, dt, account, plugin)
                logappend(fpath, mindt, sender, msg)

        if cleanup:
            print('deleting old log: %s' % (log))
            os.unlink(log)
def do_skype(skypedbpath, logpathbase):
    """Convert the messages of a Skype sqlite database into text logs.

    For every ``skypename`` in the ``Accounts`` table, all rows of
    ``Messages`` whose ``chatname`` contains that name are read in
    ascending ``timestamp`` order. Each message is appended to
    ``<logpathbase>/<account>/<dialog_partner>/<logfilename>``; the file
    name carries the message date with a zeroed time.

    skypedbpath -- path of the Skype database file
    logpathbase -- root directory of the converted logs
    """
    db = sqlite3.connect(skypedbpath)
    try:
        cursor = db.cursor()
        cursor.execute('''SELECT `skypename` from Accounts''')
        accounts = [row[0] for row in cursor.fetchall()]

        for account in accounts:
            cursor.execute('''
                SELECT
                    `timestamp`,
                    `dialog_partner`,
                    `author`,
                    `from_dispname`,
                    `body_xml`
                FROM
                    `Messages`
                WHERE
                    `chatname` LIKE ?
                ORDER BY
                    `timestamp` ASC
            ''', ('%' + account + '%',))

            for timestamp, partner, author, dispname, body in cursor.fetchall():
                if not partner:
                    # No partner means no directory to file the message
                    # under; os.path.join would fail on None.
                    logging.debug(
                        'message without dialog partner skipped (%s)',
                        timestamp
                    )
                    continue

                dt = arrow.get(timestamp)
                dt = dt.replace(tzinfo='UTC')
                fpath = os.path.join(
                    logpathbase,
                    account,
                    partner,
                    logfilename(dt, nulltime=True)
                )
                os.makedirs(os.path.dirname(fpath), exist_ok=True)

                logcreate(fpath, partner, dt, account, 'skype')
                # Avoid writing the literal "None" for empty columns.
                logappend(
                    fpath,
                    dt,
                    dispname or author,
                    body if body is not None else ''
                )
    finally:
        db.close()
if __name__ == '__main__':
    # Command line entry point: parse the options, configure logging and
    # run the converter of every enabled source.
    LLEVEL = {
        'critical': logging.CRITICAL,
        'error': logging.ERROR,
        'warning': logging.WARNING,
        'info': logging.INFO,
        'debug': logging.DEBUG
    }

    parser = argparse.ArgumentParser(description='Parameters for Skype v2 logs to Pidgin logs converter')
    parser.add_argument(
        '--skype_db',
        default=os.path.expanduser('~/.skype/main.db'),
        help='absolute path to skype main.db'
    )
    parser.add_argument(
        '--pidgin_logs',
        default=os.path.expanduser('~/.purple/logs/skype'),
        help='absolute path to Pidgin skype logs'
    )
    parser.add_argument(
        '--loglevel',
        default='warning',
        # Reject unknown levels with a usage error, not a KeyError.
        choices=list(LLEVEL),
        help='change loglevel'
    )

    # Every source gets an on/off switch; all but skype also get a log
    # directory and a timezone option.
    for allowed in ['skype', 'trillian', 'msnplus']:
        parser.add_argument(
            '--%s' % allowed,
            action='store_true',
            default=False,
            help='convert %s logs' % allowed
        )
        if allowed != 'skype':
            parser.add_argument(
                '--%s_logs' % allowed,
                default=os.path.expanduser('~/.%s/logs' % allowed),
                help='absolute path to %s logs' % allowed
            )
            parser.add_argument(
                '--%s_timezone' % allowed,
                default='UTC',
                help='timezone name for %s logs (eg. US/Pacific)' % allowed
            )

    # Kept as a module-level name on purpose: other code in this file may
    # look it up as a global.
    params = vars(parser.parse_args())

    # remove the rest of the potential loggers
    for handler in list(logging.root.handlers):
        logging.root.removeHandler(handler)

    logging.basicConfig(
        level=LLEVEL[params.get('loglevel')],
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    if params.get('skype'):
        logging.info('Skype enabled; parsing skype logs')
        do_skype(
            params.get('skype_db'),
            params.get('pidgin_logs')
        )

    if params.get('trillian'):
        logging.info('Trillian enabled; parsing trillian logs')
        do_trillian(
            params.get('trillian_logs'),
            params.get('pidgin_logs'),
            params.get('trillian_timezone'),
        )

    if params.get('msnplus'):
        logging.info('MSN Plus! enabled; parsing logs')
        do_msnplus(
            params.get('msnplus_logs'),
            params.get('pidgin_logs'),
            params.get('msnplus_timezone'),
        )