python抓取luoo站点专辑
  

python抓取luoo站点专辑

#!/usr/bin/env python2
# vim: set fileencoding=utf8

import os
import random
import re
import sys
import urllib
import urllib2

# for format string
def format_string(s):
    """Return the integer *s* as a decimal string zero-padded to two digits.

    Values that already need two or more digits are returned unpadded,
    e.g. 3 -> '03', 12 -> '12', 100 -> '100'.
    """
    return '{:0>2d}'.format(s)

# Terminal color template: the first slot is the SGR color code, the second
# is the text to colorize; the trailing escape resets the attributes.
s = u'\x1b[1;%dm%s\x1b[0m'

luo_domain = 'http://www.luoo.net/music/{id}'
download_domain = 'http://luoo.800edu.net/low/luoo/radio{id}/'
thumb_580_url = 'http://img.luoo.net/pics/albums/{song_id}/cover.jpg_580x580.jpg'
path = '/Users/mac/Develop/cygnuss/luoo/'

album_id = 30  # id of the album page to download

# Maps the zero-based track index to a dict with 'id', 'title', 'thumb', 'url'.
album_section = {}

# Fetch the album page; close the connection even if reading fails.
response = urllib2.urlopen(luo_domain.replace('{id}', str(album_id)))
try:
    html = response.read()
finally:
    response.close()

p = re.compile(r'<span class="vol-number rounded">([^/]*)</span>')
vol = p.findall(html)
if not vol:
    # Without the volume number there is no directory name to download into.
    raise SystemExit('volume number not found on the page of album %s' % album_id)

vol = vol[0].strip()  # e.g. '100'
dir_ = path + 'vol.' + vol

# makedirs (rather than mkdir) also creates missing parent directories.
if not os.path.exists(dir_):
    os.makedirs(dir_)

p = re.compile(r'<img src="http://img2.luoo.net/pics/albums/(.+)/cover.jpg_580x580.jpg" alt=".+" class="cover rounded">')
songs = p.findall(html)

p = re.compile(r'<a href="javascript:;" rel="nofollow" class="trackname btn-play">(.+)</a>')
song_titles = p.findall(html)

p = re.compile(r'<span class="vol-number rounded">.+</span>\s*<span class="vol-title">(.+)</span>')
album_title = p.findall(html)

# Fill in the album id once, outside the loop, so the URL template in
# download_domain is left intact.
album_base_url = download_domain.replace('{id}', str(album_id))

# zip() pairs each cover id with its track title and stops at the shorter
# list, so a mismatch between the two regex results cannot raise IndexError.
for index, (song, title) in enumerate(zip(songs, song_titles)):
    num = '{:0>2d}'.format(index + 1)  # track files are named 01.mp3, 02.mp3, ...
    album_section[index] = {
        'id': song,
        'title': title,
        'thumb': thumb_580_url.replace('{song_id}', song),
        'url': album_base_url + num + '.mp3',
    }

# Fall back to the volume label when the album title is missing from the page.
album_name = album_title[0] if album_title else 'vol.' + vol
print('开始下载 ' + '《' + album_name + '》')

amount_songs = len(album_section)
# sorted() makes the download order follow the track order explicitly
# instead of relying on dict iteration order.
for key in sorted(album_section):
    section = album_section[key]
    num = random.randint(0, 100) % 7  # random offset 0..6 -> color codes 90..96
    col = s % (num + 90, section['title'].decode('utf-8'))
    print(u'\n ++ 正在下载: #%s/%s# %s' % (key + 1, amount_songs, col))
    # A '/' inside a title would otherwise be read as a directory separator.
    safe_title = section['title'].replace('/', '_')
    song_path = os.path.join(dir_, safe_title + '.mp3')
    img_path = os.path.join(dir_, safe_title + '.jpg')
    urllib.urlretrieve(section['url'], song_path)
    urllib.urlretrieve(section['thumb'], img_path)

back to the top