python抓取luoo站点专辑

2014-08-18
#!/usr/bin/env python2
# vim: set fileencoding=utf8

import sys
import re
import urllib
import urllib2
import os
import random

# Zero-padded track numbers for file names and URLs.
def format_string(s):
    """Return the integer ``s`` as decimal text, left-padded with '0' to width 2.

    Values that already need two or more characters are returned unpadded.
    """
    padded = format(s, '0>2d')
    return padded

s = u'\x1b[1;%dm%s\x1b[0m'       # terminal color template: (SGR color code, text)

# URL templates; the {id} / {song_id} placeholders are filled in below.
luo_domain = 'http://www.luoo.net/music/{id}'
download_domain = 'http://luoo.800edu.net/low/luoo/radio{id}/'
thumb_580_url = 'http://img.luoo.net/pics/albums/{song_id}/cover.jpg_580x580.jpg'
path = '/Users/mac/Develop/cygnuss/luoo/'  # root directory for downloads

album_id = 30 # define album id

# 0-based track index -> {'id', 'title', 'thumb', 'url'}
album_section = {}

# Fetch the album page; close the connection even if reading fails.
page_url = luo_domain.format(id=album_id)
response = urllib2.urlopen(page_url)
try:
    html = response.read()
finally:
    response.close()

p = re.compile(r'<span class="vol-number rounded">([^/]*)</span>')
vol = p.findall(html)
if not vol:
    # Without a volume number there is no directory name to download into.
    sys.exit('no volume number found on ' + page_url)

vol = vol[0].strip() # vol .100
dir_ = os.path.join(path, 'vol.' + vol)

# Must be dir_ here: the bare name `dir` is the builtin function, not the
# target directory, and passing it to os.path raises TypeError.
if not os.path.exists(dir_):
    os.makedirs(dir_)

p = re.compile(r'<img src="http://img2.luoo.net/pics/albums/(.+)/cover.jpg_580x580.jpg" alt=".+" class="cover rounded">')
songs = p.findall(html)

p = re.compile(r'<a href="javascript:;" rel="nofollow" class="trackname btn-play">(.+)</a>')
song_titles = p.findall(html)

p = re.compile(r'<span class="vol-number rounded">.+</span>\s*<span class="vol-title">(.+)</span>')
album_title = p.findall(html)
# Fall back to the volume label when the title markup is not found.
album_name = album_title[0] if album_title else 'vol.' + vol

# The album base URL is the same for every track, so resolve it once and
# leave the download_domain template itself untouched.
album_base_url = download_domain.format(id=album_id)

# zip() pairs each cover id with its title and stops at the shorter list,
# so a mismatch between the two regex results cannot raise IndexError.
for count, (song, title) in enumerate(zip(songs, song_titles)):
    album_section[count] = {
        'id': song,
        'title': title,
        'thumb': thumb_580_url.format(song_id=song),
        # Track files are named 01.mp3, 02.mp3, ... under the album URL.
        'url': album_base_url + format_string(count + 1) + '.mp3',
    }

print('开始下载 ' + '《' + album_name + '》')

amount_songs = len(album_section)
# sorted() makes the download order follow the track numbers regardless of
# the dict's internal ordering.
for key in sorted(album_section):
    track = album_section[key]
    color_code = random.randint(90, 96)  # same 90..96 range as before
    col = s % (color_code, track['title'].decode('utf-8'))
    print(u'\n  ++ 正在下载: #%s/%s# %s' % (key+1 , amount_songs , col))
    # A '/' inside a title would be read as a directory separator.
    safe_title = track['title'].replace('/', '_')
    song_path = os.path.join(dir_, safe_title + '.mp3')
    img_path = os.path.join(dir_, safe_title + '.jpg')
    urllib.urlretrieve(track['url'], song_path)
    urllib.urlretrieve(track['thumb'], img_path)