Blogger: import from XML with images

Google 様の Blogger ですが、XML から import しても、img.src や a.href が指している画像ファイル自体は取り込んでくれません。どれが post された image か判別できないので、まあそんなものと言えばそんなものです。そこで、指定した prefix で始まる URI のファイルを Picasa にアップロードし、XML 内の URI を書き換えるスクリプトを書きました。Python 2.7 では動かないかも (mimetypes の返す値が違う)。

#!/usr/bin/env python
#-*- coding:utf-8 -*-
import sys, os, re, urllib2, urlparse, mimetypes
#http://code.google.com/p/gdata-python-client/
import atom
import gdata.photos.service


class BloggerImportHelper(object):
    """Re-host files referenced by a Blogger Atom export and rewrite the URIs.

    Configure ``username``, ``password`` and ``handleUriPrefixes`` on an
    instance, then call the instance with the XML text of the export.  Every
    ``http(s)`` URI in an entry's content that starts with one of the
    configured prefixes is downloaded, uploaded through the gdata
    ``PhotosService`` into an album named after the feed title, and replaced
    by the URL of the uploaded copy.  The rewritten feed is returned as a
    string.
    """

    # Upper bound on upload attempts per URI.  Retrying without a limit turns
    # any permanent failure (404, unsupported file type, ...) into unbounded
    # recursion, so after this many attempts the URI is left as it was.
    maxRetries = 3

    def __init__(self):
        # Account credentials passed to PhotosService.ClientLogin.
        self.username = self.password = ''
        # Only URIs starting with one of these prefixes are re-hosted.
        self.handleUriPrefixes = []

    def __call__(self, xmlcontent):
        """Return ``xmlcontent`` with the handled URIs replaced.

        Logs in, finds the album whose title equals the feed title (creating
        it when missing), then rewrites the content of every entry.
        """
        feed = atom.FeedFromString(xmlcontent)

        self.pws = gdata.photos.service.PhotosService()
        self.pws.ClientLogin(self.username, self.password)
        for album in self.pws.GetUserFeed().entry:
            if album.title.text == feed.title.text:
                self.album = album
                break
        else:
            # No album with the feed's title yet: create one.
            self.album = self.pws.InsertAlbum(feed.title.text, '')

        # Maps original URI -> replacement URI for this run, so that a file
        # referenced several times is uploaded only once.
        self.cache = {}
        for entry in feed.entry:
            content = entry.content
            # Entries without a content element (or with empty text) have
            # nothing to rewrite.
            if content is not None and content.text:
                content.text = self.processContent(content.text)

        return feed.ToString()

    def processContent(self, content):
        """Return ``content`` with every http(s) URI passed through processUri.

        A URI is taken to extend up to the next single or double quote.
        """
        return re.sub(
            'https?://[^"\']+', lambda m: self.processUri(m.group(0)), content)

    def processUri(self, uri):
        """Return the replacement for ``uri``.

        URIs that do not start with one of ``handleUriPrefixes`` are returned
        unchanged.  Handled URIs are uploaded and the uploaded URL is
        returned.  When the content type cannot be guessed from the file
        extension, or the upload still fails after ``maxRetries`` attempts,
        the original URI is returned (and remembered, so the same failure is
        not retried for every occurrence).
        """
        if uri in self.cache:
            return self.cache[uri]
        # str.startswith accepts a tuple; an empty tuple never matches.
        if not uri.startswith(tuple(self.handleUriPrefixes)):
            return uri

        # [-1] instead of [1]: a path without any '/' must not raise.
        basename = urlparse.urlparse(uri).path.rsplit('/', 1)[-1]
        extension = basename.rsplit('.', 1)[-1] if '.' in basename else ''
        contentType = None
        if extension:
            # Guess from the extension alone so that odd characters in the
            # rest of the name cannot influence the result.
            contentType = mimetypes.guess_type('mime.' + extension)[0]
        if contentType is None:
            # Without a content type there is nothing valid to hand to the
            # upload call (presumably it would be rejected), so skip it.
            print('%s %s' % (uri, 'unknown content type, left unchanged'))
            self.cache[uri] = uri
            return uri

        for _ in range(self.maxRetries):
            try:
                albumUri = self.album.GetPhotosUri()
                response = urllib2.urlopen(uri)
                try:
                    photo = self.pws.InsertPhotoSimple(
                        albumUri, basename, '', response, contentType)
                finally:
                    # Release the HTTP connection whether or not the upload
                    # succeeded.
                    response.close()
            except Exception as e:
                # Download and upload can fail in many library-specific
                # ways; report and try again, up to maxRetries times.
                print('%s %s' % (uri, e))
            else:
                self.cache[uri] = photo.media.content[0].url
                return self.cache[uri]

        # Every attempt failed: keep the original URI instead of looping.
        self.cache[uri] = uri
        return uri


if __name__ == '__main__':
    # Usage: script BLOGGER_EXPORT.xml
    # Reads the Blogger Atom export given as the first argument and writes
    # the rewritten feed to 'test.atom' in the current directory.
    if len(sys.argv) < 2:
        sys.exit('usage: %s BLOGGER_EXPORT.xml' % sys.argv[0])

    converter = BloggerImportHelper()
    # Picasa username and password
    converter.username = 'you@gmail.com'
    converter.password = 'password'
    # URI prefix list to upload
    converter.handleUriPrefixes = [
        'http://localhost/images/',
        ]

    # sys.argv[1] is XML file (Blogger atom feed)
    with open(sys.argv[1], 'rb') as source:
        xmlcontent = source.read()

    # Convert before opening the output, so that a failed run does not leave
    # a truncated 'test.atom' behind.
    converted = converter(xmlcontent)

    with open('test.atom', 'wb') as target:
        target.write(converted)