#!/usr/bin/python
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2010 Clay Baenziger. All rights reserved.
#
import time, datetime, re, tempfile, os, shutil
from lxml import html
from lxml.builder import E
from lxml import etree as ET
class PipermailList:
    '''
    Class for interacting with Pipermail e-mail archives hosted on the WWW in
    HTML format
    '''
    # namespace URI that enables EXSLT regular expressions (re:test) in XPath
    _REGEXP_NS = 'http://exslt.org/regular-expressions'

    def __init__(self, URL, listName):
        '''
        URL      - URL of the list archive index page
        listName - the list name (as printed in the e-mail subject line)
        '''
        # URL for the list archive
        self.URL = URL
        # The list name (as printed in the e-mail subject line)
        self.listName = listName

    def searchArchivesBySubject(self, subRE=None):
        '''
        Walk every monthly thread index of the archive (ordered by year, then
        month) and yield one dict per top-level e-mail whose link text matches
        the regular expression subRE (case insensitive).

        subRE defaults to a pattern matching subjects which start with
        "[<listName>]".

        Each yielded dict has the keys author, subject, eMailURL and date
        (date is a struct_time object).

        Raises IndexError if a message page has no <a title=...> tag or no
        leading <i> tag holding a date in the expected format.
        '''
        # set subRE here so we can include the listName; escape the name so
        # that any regex metacharacters in it are matched literally
        if subRE is None:
            subRE = r'^\[' + re.escape(self.listName) + r'\]'
        subjectPattern = re.compile(subRE, re.IGNORECASE)

        # select <a> tags carrying an href, followed by a sibling <i> tag,
        # which are children of an <li> whose <ul> is not nested inside
        # another <li> (get top-level posts only)
        threadQuery = \
            "//ul[not(ancestor::li)]/li/a[@href and following-sibling::i]"
        # date is only in the body of the page and should be the
        # first italic entry
        # regex looks for entries in the format:
        # Wed Apr 2 01:23:57 PDT 2008
        dateQuery = (
            "//body/i[1][re:test(.,'^(Mon|Tue|Wed|Thu|Fri|Sat|Sun) "
            "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) "
            "[ 123][0-9] .* [12][0-9]{3}$','i')]")

        archives = self._getByYearMonth()
        for year in sorted(archives):
            for month in sorted(archives[year]):
                # load this archive page
                archiveURL = self.URL + archives[year][month]
                threadList = html.parse(archiveURL)
                # get the base directory of the archive page
                # (i.e. cut off blah.html)
                baseURL = re.sub(r'/[^/]*\.html$', '/', archiveURL)
                for tag in threadList.xpath(threadQuery):
                    # the link text of a thread entry is matched against
                    # subRE; skip posts which do not match
                    linkText = tag.text_content().strip()
                    if not subjectPattern.search(linkText):
                        continue
                    eMailURL = baseURL + tag.attrib['href']
                    eMail = html.parse(eMailURL)
                    # Find the particular tag which has a title attribute as
                    # its text will be the author and the title attribute
                    # will be the subject
                    authoritativeTag = eMail.xpath('//a[@title]')[0]
                    author = authoritativeTag.text.strip()
                    subject = self._cleanSubject(
                        authoritativeTag.attrib['title'])
                    dateTag = eMail.xpath(
                        dateQuery, namespaces={'re': self._REGEXP_NS})[0]
                    date = self._parseDate(dateTag.text.strip())
                    # each message is represented by a dict
                    yield {
                        'author': author,
                        'subject': subject,
                        'eMailURL': eMailURL,
                        'date': date,
                    }

    def _cleanSubject(self, rawTitle):
        '''
        Return rawTitle with surrounding whitespace removed, a leading
        "[<listName>]" tag (and the spaces after it) removed and every
        remaining tab, newline or space replaced by a single space character.
        '''
        title = rawTitle.strip()
        # remove the tag as a real prefix; str.lstrip() would treat its
        # argument as a set of characters and also eat the first letters of
        # the subject itself
        prefix = '[' + self.listName + ']'
        if title.startswith(prefix):
            title = title[len(prefix):].lstrip(' ')
        return re.sub(r'[\t\n ]', ' ', title)

    @staticmethod
    def _parseDate(dateText):
        '''
        Convert a date string such as "Wed Apr 2 01:23:57 PDT 2008" into a
        struct_time object, ignoring the timezone abbreviation.
        '''
        # strip the timezone as Solaris strptime(3) can't seem to
        # accept it if it is not our localtz
        withoutTZ = re.sub(r'(:[0-9]{2}) [a-zA-Z]* ([0-9]{4})',
            r'\1 \2', dateText)
        return time.strptime(withoutTZ, '%a %b %d %H:%M:%S %Y')

    def _getByYearMonth(self):
        '''
        Function to build a dictionary of thread sorted archive URLs keyed by
        year (a string such as "2008"), then by month (the number 1-12):
        archives[year][month]=url
        '''
        # download archive page and parse XHTML
        archivesPage = html.parse(self.URL)
        # table structure for archive list is:
        # <tr>
        #   <td>Month Year:</td>
        #   <td>......<a href="...">[ Thread ]</a>......</td>
        #   <td>...</td>
        # </tr>
        # every month has a link to show the archive by Thread
        # do a regex search for <a> tags which have the text of
        # '^[ Thread ]$'
        monthURLs = archivesPage.xpath(
            r"//a[re:test(., '^\[ Thread \]$','')]",
            namespaces={'re': self._REGEXP_NS})
        archives = {}
        # iterate over Thread URL a tags
        for tag in monthURLs:
            # the grandparent of the link is the row whose first cell lists
            # the month and year (i.e. "April 2008:")
            row = tag.getparent().getparent()
            month, year = row.xpath('.//td[1]/text()')[0].split()
            # make archives a dictionary keyed by year, and archives[year] a
            # dictionary keyed by month (the number 1-12) opposed to the full
            # text (i.e. January)
            monthNumber = time.strptime(month, '%B').tm_mon
            archives.setdefault(year.rstrip(':'), {})[monthNumber] = \
                tag.attrib['href']
        # we now have archives[year][month]=url
        return archives

    def xmlSerialize(self, messageList):
        '''
        Accept an iterable of message dicts (as yielded by
        searchArchivesBySubject) and return a "message-list" XML element
        holding one "message" child per dict, for passing around
        '''
        root = E('message-list', list=self.listName, url=self.URL)
        for msg in messageList:
            root.append(E(
                'message',
                E('date', time.strftime('%m/%d/%Y %H:%M:%S', msg['date'])),
                E('author', msg['author']),
                E('subject', msg['subject']),
                url=msg['eMailURL']
            ))
        return root
if __name__ == '__main__':
    # Search the caiman-discuss Pipermail archive and print the matching
    # top-level messages (date, author, subject and URL) as an XML document
    # on standard output.
    caiman_discuss = PipermailList(
        URL='http://mail.opensolaris.org/pipermail/caiman-discuss/',
        listName='caiman-discuss')
    root = E("sub", topic="OpenSolaris Install")
    root.append(caiman_discuss.xmlSerialize(
        caiman_discuss.searchArchivesBySubject()))
    # tostring() returns a byte string (ASCII encoded by default); decode it
    # and use the function-call form of print so the same text is printed
    # whether this is run by Python 2 or Python 3
    print(ET.tostring(root, pretty_print=True).decode('ascii'))