Last active
February 11, 2016 23:11
-
-
Save thiagomarzagao/1fa0c6776ab207a7a086 to your computer and use it in GitHub Desktop.
HTML that shows mangled code block
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<div class="highlight"><pre><code class="language-python" data-lang="python"><table style="border-spacing: 0"><tbody><tr><td class="gutter gl" style="text-align: right"><pre class="lineno">1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | |
15 | |
16 | |
17 | |
18 | |
19 | |
20 | |
21 | |
22 | |
23 | |
24 | |
25 | |
26 | |
27 | |
28 | |
29 | |
30 | |
31 | |
32 | |
33 | |
34 | |
35 | |
36 | |
37 | |
38 | |
39 | |
40 | |
41 | |
42 | |
43 | |
44 | |
45 | |
46 | |
47 | |
48 | |
49 | |
50 | |
51 | |
52 | |
53 | |
54 | |
55 | |
56 | |
57 | |
58 | |
59 | |
60 | |
61 | |
62 | |
63 | |
64 | |
65 | |
66 | |
67 | |
68 | |
69 | |
70 | |
71 | |
72 | |
73 | |
74 | |
75 | |
76 | |
77 | |
78 | |
79 | |
80 | |
81 | |
82 | |
83 | |
84 | |
85 | |
86 | |
87 | |
88</pre></td><td class="code"><pre><span class="s">''' | |
scrape lyrics from vagalume.com.br | |
(author: thiagomarzagao.com) | |
'''</span> | |
<span class="kn">import</span> <span class="nn">json</span> | |
<span class="kn">import</span> <span class="nn">time</span> | |
<span class="kn">import</span> <span class="nn">pickle</span> | |
<span class="kn">import</span> <span class="nn">requests</span> | |
<span class="kn">from</span> <span class="nn">bs4</span> <span class="kn">import</span> <span class="n">BeautifulSoup</span> | |
<span class="c"># get each genre's URL</span> | |
<span class="n">basepath</span> <span class="o">=</span> <span class="s">'http://www.vagalume.com.br'</span> | |
<span class="n">r</span> <span class="o">=</span> <span class="n">requests</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">basepath</span> <span class="o">+</span> <span class="s">'/browse/style/'</span><span class="p">)</span> | |
<span class="n">soup</span> <span class="o">=</span> <span class="n">BeautifulSoup</span><span class="p">(</span><span class="n">r</span><span class="o">.</span><span class="n">text</span><span class="p">)</span> | |
<span class="n">genres</span> <span class="o">=</span> <span class="p">[</span><span class="s">u'Rock'</span><span class="p">,</span> | |
<span class="s">u'Ax</span><span class="se">\u00E9</span><span class="s">'</span><span class="p">,</span> | |
<span class="s">u'Forr</span><span class="se">\u00F3</span><span class="s">'</span><span class="p">,</span> | |
<span class="s">u'Pagode'</span><span class="p">,</span> | |
<span class="s">u'Samba'</span><span class="p">,</span> | |
<span class="s">u'Sertanejo'</span><span class="p">,</span> | |
<span class="s">u'MPB'</span><span class="p">,</span> | |
<span class="s">u'Rap'</span><span class="p">]</span> | |
<span class="n">genre_urls</span> <span class="o">=</span> <span class="p">{}</span> | |
<span class="k">for</span> <span class="n">genre</span> <span class="ow">in</span> <span class="n">genres</span><span class="p">:</span> | |
<span class="n">genre_urls</span><span class="p">[</span><span class="n">genre</span><span class="p">]</span> <span class="o">=</span> <span class="n">soup</span><span class="o">.</span><span class="n">find</span><span class="p">(</span><span class="s">'a'</span><span class="p">,</span> <span class="n">class_</span> <span class="o">=</span> <span class="s">'eA'</span><span class="p">,</span> <span class="n">text</span> <span class="o">=</span> <span class="n">genre</span><span class="p">)</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s">'href'</span><span class="p">)</span> | |
<span class="c"># get each artist's URL, per genre</span> | |
<span class="n">artist_urls</span> <span class="o">=</span> <span class="p">{</span><span class="n">e</span><span class="p">:</span> <span class="p">[]</span> <span class="k">for</span> <span class="n">e</span> <span class="ow">in</span> <span class="n">genres</span><span class="p">}</span> | |
<span class="k">for</span> <span class="n">genre</span> <span class="ow">in</span> <span class="n">genres</span><span class="p">:</span> | |
<span class="n">r</span> <span class="o">=</span> <span class="n">requests</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">basepath</span> <span class="o">+</span> <span class="n">genre_urls</span><span class="p">[</span><span class="n">genre</span><span class="p">])</span> | |
<span class="n">soup</span> <span class="o">=</span> <span class="n">BeautifulSoup</span><span class="p">(</span><span class="n">r</span><span class="o">.</span><span class="n">text</span><span class="p">)</span> | |
<span class="n">counter</span> <span class="o">=</span> <span class="mi">0</span> | |
<span class="k">for</span> <span class="n">artist</span> <span class="ow">in</span> <span class="n">soup</span><span class="o">.</span><span class="n">find_all</span><span class="p">(</span><span class="s">'a'</span><span class="p">,</span> <span class="n">class_</span> <span class="o">=</span> <span class="s">'top'</span><span class="p">):</span> | |
<span class="n">counter</span> <span class="o">+=</span> <span class="mi">1</span> | |
<span class="k">print</span> <span class="s">'artist {} </span><span class="se">\r</span><span class="s">'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">counter</span><span class="p">)</span> | |
<span class="n">artist_urls</span><span class="p">[</span><span class="n">genre</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">basepath</span> <span class="o">+</span> <span class="n">artist</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s">'href'</span><span class="p">))</span> | |
<span class="n">time</span><span class="o">.</span><span class="n">sleep</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> <span class="c"># don't reduce the 2-second wait (here or below) or you get errors</span> | |
<span class="c"># get each lyrics, per genre</span> | |
<span class="n">api</span> <span class="o">=</span> <span class="s">'http://api.vagalume.com.br/search.php?musid='</span> | |
<span class="n">genre_lyrics</span> <span class="o">=</span> <span class="p">{</span><span class="n">e</span><span class="p">:</span> <span class="p">{}</span> <span class="k">for</span> <span class="n">e</span> <span class="ow">in</span> <span class="n">genres</span><span class="p">}</span> | |
<span class="k">for</span> <span class="n">genre</span> <span class="ow">in</span> <span class="n">artist_urls</span><span class="p">:</span> | |
<span class="k">print</span> <span class="nb">len</span><span class="p">(</span><span class="n">artist_urls</span><span class="p">[</span><span class="n">genre</span><span class="p">])</span> | |
<span class="n">counter</span> <span class="o">=</span> <span class="mi">0</span> | |
<span class="n">artist1</span> <span class="o">=</span> <span class="bp">None</span> | |
<span class="k">for</span> <span class="n">url</span> <span class="ow">in</span> <span class="n">artist_urls</span><span class="p">[</span><span class="n">genre</span><span class="p">]:</span> | |
<span class="n">success</span> <span class="o">=</span> <span class="bp">False</span> | |
<span class="k">while</span> <span class="ow">not</span> <span class="n">success</span><span class="p">:</span> <span class="c"># retry loop in case your connection flickers</span> | |
<span class="k">try</span><span class="p">:</span> | |
<span class="n">r</span> <span class="o">=</span> <span class="n">requests</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">url</span><span class="p">)</span> | |
<span class="n">success</span> <span class="o">=</span> <span class="bp">True</span> | |
<span class="k">except</span><span class="p">:</span> | |
<span class="n">time</span><span class="o">.</span><span class="n">sleep</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> | |
<span class="n">soup</span> <span class="o">=</span> <span class="n">BeautifulSoup</span><span class="p">(</span><span class="n">r</span><span class="o">.</span><span class="n">text</span><span class="p">)</span> | |
<span class="n">hrefs</span> <span class="o">=</span> <span class="n">soup</span><span class="o">.</span><span class="n">find_all</span><span class="p">(</span><span class="s">'a'</span><span class="p">)</span> | |
<span class="k">for</span> <span class="n">href</span> <span class="ow">in</span> <span class="n">hrefs</span><span class="p">:</span> | |
<span class="k">if</span> <span class="n">href</span><span class="o">.</span><span class="n">has_attr</span><span class="p">(</span><span class="s">'data-song'</span><span class="p">):</span> | |
<span class="n">song_id</span> <span class="o">=</span> <span class="n">href</span><span class="p">[</span><span class="s">'data-song'</span><span class="p">]</span> | |
<span class="k">print</span> <span class="n">song_id</span> | |
<span class="n">time</span><span class="o">.</span><span class="n">sleep</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> | |
<span class="n">success</span> <span class="o">=</span> <span class="bp">False</span> | |
<span class="k">while</span> <span class="ow">not</span> <span class="n">success</span><span class="p">:</span> | |
<span class="k">try</span><span class="p">:</span> | |
<span class="n">song_metadata</span> <span class="o">=</span> <span class="n">requests</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">api</span> <span class="o">+</span> <span class="n">song_id</span><span class="p">)</span><span class="o">.</span><span class="n">json</span><span class="p">()</span> | |
<span class="n">success</span> <span class="o">=</span> <span class="bp">True</span> | |
<span class="k">except</span><span class="p">:</span> | |
<span class="n">time</span><span class="o">.</span><span class="n">sleep</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span> | |
<span class="k">if</span> <span class="s">'mus'</span> <span class="ow">in</span> <span class="n">song_metadata</span><span class="p">:</span> | |
<span class="k">if</span> <span class="s">'lang'</span> <span class="ow">in</span> <span class="n">song_metadata</span><span class="p">[</span><span class="s">'mus'</span><span class="p">][</span><span class="mi">0</span><span class="p">]:</span> <span class="c"># discard if no language info</span> | |
<span class="n">language</span> <span class="o">=</span> <span class="n">song_metadata</span><span class="p">[</span><span class="s">'mus'</span><span class="p">][</span><span class="mi">0</span><span class="p">][</span><span class="s">'lang'</span><span class="p">]</span> | |
<span class="k">if</span> <span class="n">language</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> <span class="c"># discard if language != Portuguese</span> | |
<span class="k">if</span> <span class="s">'text'</span> <span class="ow">in</span> <span class="n">song_metadata</span><span class="p">[</span><span class="s">'mus'</span><span class="p">][</span><span class="mi">0</span><span class="p">]:</span> <span class="c"># discard if no lyrics</span> | |
<span class="n">artist2</span> <span class="o">=</span> <span class="n">song_metadata</span><span class="p">[</span><span class="s">'art'</span><span class="p">][</span><span class="s">'name'</span><span class="p">]</span> | |
<span class="k">if</span> <span class="n">artist2</span> <span class="o">!=</span> <span class="n">artist1</span><span class="p">:</span> | |
<span class="k">if</span> <span class="n">counter</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> | |
<span class="k">print</span> <span class="n">artist1</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s">'utf-8'</span><span class="p">)</span> <span class="c"># change as needed</span> | |
<span class="n">genre_lyrics</span><span class="p">[</span><span class="n">genre</span><span class="p">][</span><span class="n">artist1</span><span class="p">]</span> <span class="o">=</span> <span class="n">artist_lyrics</span> | |
<span class="n">artist1</span> <span class="o">=</span> <span class="n">artist2</span> | |
<span class="n">artist_lyrics</span> <span class="o">=</span> <span class="p">[]</span> | |
<span class="n">lyrics</span> <span class="o">=</span> <span class="n">song_metadata</span><span class="p">[</span><span class="s">'mus'</span><span class="p">][</span><span class="mi">0</span><span class="p">][</span><span class="s">'text'</span><span class="p">]</span> | |
<span class="n">artist_lyrics</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">lyrics</span><span class="p">)</span> | |
<span class="n">counter</span> <span class="o">+=</span> <span class="mi">1</span> | |
<span class="k">print</span> <span class="s">'lyrics {} </span><span class="se">\r</span><span class="s">'</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">counter</span><span class="p">)</span> | |
<span class="c"># serialize</span> | |
<span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">genre</span> <span class="o">+</span> <span class="s">'.json'</span><span class="p">,</span> <span class="n">mode</span> <span class="o">=</span> <span class="s">'wb'</span><span class="p">)</span> <span class="k">as</span> <span class="n">fbuffer</span><span class="p">:</span> | |
<span class="n">json</span><span class="o">.</span><span class="n">dump</span><span class="p">(</span><span class="n">genre_lyrics</span><span class="p">[</span><span class="n">genre</span><span class="p">],</span> <span class="n">fbuffer</span><span class="p">)</span><span class="w"> | |
</span></pre></td></tr></tbody></table> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment