如何使用BeautifulSoup从LinkedIn抓取公开可用的数据

时间:2017-12-05 10:51:18

标签: python web-scraping beautifulsoup linkedin

我试图使用BeautifulSoup从LinkedIn抓取公开可用的主题。但是在尝试抓取时,我总是得到一个"免费注册以查看更多"的弹出页面,无法获取任何数据。有人可以帮帮我吗?

以下是我为获取主题而编写的代码:

from __future__ import print_function
import requests
from bs4 import BeautifulSoup
import operator
import os
import sys

topics = []

def get_main_topics(url):
    source_code = requests.get(url).text
    soup = BeautifulSoup(source_code, 'html.parser')
    for main_topics in soup.find_all('ul', {'class' : 'quad-column'}):
        for sub_links in main_topics.find_all('a'):
            topic = sub_links.string
            print(topic)
            topics.append(topic)

def get_topics(url, alphabet):
    print('Inside get Topics')
    main_url = str(url) + str(alphabet) + '/'
    if alphabet != 'y' or alphabet != 'z':
        source_code = requests.get(main_url).text
        soup = BeautifulSoup(source_code, 'html.parser')
        for main_topics in soup.find_all('ul', {'class' : 'quad-column'})[1]:
            for sub_links in main_topics.find_all('a'):
                sub_url = sub_links.get('href')
                get_main_topics(sub_url)
    else:
        source_code = requests.get(main_url).text
        soup = BeautifulSoup(source_code, 'html.parser')
        for main_topics in soup.find_all('ul', {'class' : 'quad-column'}):
            for sub_links in main_topics.find_all('a'):
                topic = sub_links.string
                print(topic)
                topics.append(topic)

def start():
    print('Started Crawling...')
    alphabets = ('a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z')
    for alphabet in alphabets:
        get_topics('https://www.linkedin.com/directory/topics-', alphabet)
    fw = open('linkedin_topics.txt', 'w')
    for i in topics:
        try:
            fw.write(i+'\n')
        except:
            print(i)
            continue
    print('Crawling Completed and updated...')

检查BeautifulSoup解析后的输出,我得到的是下面这个页面(而不是真正的目录页HTML):

<html><head>
<script type="text/javascript">
window.onload = function() {
  // Parse the tracking code from cookies.
  var trk = "bf";
  var trkInfo = "bf";
  var cookies = document.cookie.split("; ");
  for (var i = 0; i < cookies.length; ++i) {
    if ((cookies[i].indexOf("trkCode=") == 0) && (cookies[i].length > 8)) {
      trk = cookies[i].substring(8);
    }
    else if ((cookies[i].indexOf("trkInfo=") == 0) && (cookies[i].length > 8)) {
      trkInfo = cookies[i].substring(8);
    }
  }

  if (window.location.protocol == "http:") {
    // If "sl" cookie is set, redirect to https.
    for (var i = 0; i < cookies.length; ++i) {
      if ((cookies[i].indexOf("sl=") == 0) && (cookies[i].length > 3)) {
        window.location.href = "https:" + window.location.href.substring(window.location.protocol.length);
        return;
      }
    }
  }

  // Get the new domain. For international domains such as
  // fr.linkedin.com, we convert it to www.linkedin.com
  var domain = "www.linkedin.com";
  if (domain != location.host) {
    var subdomainIndex = location.host.indexOf(".linkedin");
    if (subdomainIndex != -1) {
      domain = "www" + location.host.substring(subdomainIndex);
    }
  }

  window.location.href = "https://" + domain + "/authwall?trk=" + trk + "&trkInfo=" + trkInfo +
      "&originalReferer=" + document.referrer.substr(0, 200) +
      "&sessionRedirect=" + encodeURIComponent(window.location.href);
}
</script>
</head></html>

0 个答案:

没有答案