Scraping Data from LinkedIn

Normal methods are not workable on LinkedIn, which manages to block many forms of scrape bot. I used Selenium, a framework used for software testing by emulating a user using a browser.

Import dependencies

In [2]:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from random import randint
from bs4 import BeautifulSoup
import pandas as pd
import time
from openpyxl import load_workbook

Profiles to scrape

Get the list of names to search for. Names are processed to remove “bin”, “a/l” etc.

In [17]:

# load data
profiles = pd.read_excel('../example.xlsx', sheet_name='Sheet1', usecols=[0])
profiles['Name'] = profiles['Name'].dropna().apply(lambda x: x.lower().replace('/','').replace(
    ' binti ',' bin ').replace(' ap ',' bin ').replace(' al ',' bin ').replace(' bt ',' bin '))

# separate names
new = profiles['Name'].str.split(' bin ', n = 1, expand = True)
profiles['First Name']= new[0]
profiles['Last Name']= new[1]
profiles['Full Name'] = profiles['First Name'] + " " + profiles['Last Name']
profiles['Full Name'].fillna(value=profiles['Name'], inplace=True)

# profiles.reset_index(inplace=True)
profiles

	Name	First Name	Last Name	Full Name
0	shaw kok hao	shaw kok hao	None	shaw kok hao
1	arumugam bin pillay	arumugam	pillay	arumugam pillay
2	pravin	pravin	None	pravin
3	tan pei ling	tan pei ling	None	tan pei ling
4	shaw kok hao	shaw kok hao	None	shaw kok hao
5	ashraf bin asli	ashraf	asli	ashraf asli

Methods

In [4]:

# fetch page html source
def fetch_html(driver):
    return driver.page_source

# parse html source to beautifulsoup
def parse_html(driver, html):
    return BeautifulSoup(html,'lxml')


# extract info from html using beautifulsoup
def extract_basic_info(soup, url):
    
    if soup.find('h1', {'class':'pv-top-card-section__name inline t-24 t-black t-normal'}):
        name = soup.find('h1', {'class':'pv-top-card-section__name inline t-24 t-black t-normal'}).getText().strip()
    else:
        name = None
        url = None
        
    if soup.find('h2',{'class':'pv-top-card-section__headline mt1 t-18 t-black t-normal'}):
        self_intro1 = soup.find('h2',{'class':'pv-top-card-section__headline mt1 t-18 t-black t-normal'}).getText().strip()
    else: self_intro1 = None
    
    if soup.find('h3', {'class':'pv-entity__school-name t-16 t-black t-bold'}):
        school = soup.find('h3', {'class':'pv-entity__school-name t-16 t-black t-bold'}).getText()
    else: school = None
        
    if soup.find('span', {'class':'pv-entity__comma-item'}):
        degree = soup.find('span', {'class':'pv-entity__comma-item'}).getText()
    else: degree = None

    if soup.find('span',{'class':'pv-entity__secondary-title'}):
        current_company = soup.find('span',{'class':'pv-entity__secondary-title'}).getText()
    else: current_company = None
        
    if soup.find('h3',{'class':'t-16 t-black t-bold'}):
        current_title = soup.find('h3',{'class':'t-16 t-black t-bold'}).getText()
    else: current_title = None
    
    if soup.find('p',{'class':'pv-entity__description t-14 t-black t-normal ember-view'}):
        desc = soup.find('p',{'class':'pv-entity__description t-14 t-black t-normal ember-view'}).getText()
    else: desc = None
        
    return {"url": [url],
            "name": [name],
            "school": [school],
            "degree": [degree],
            "company": [current_company],
            "title": [current_title],
            "desc": [desc]}

# get the number of results per page
def search_num(driver):
    lst_Count= driver.find_elements_by_xpath("//div[@class='search-result__wrapper']")
    return len(lst_Count)

# search for name in linkedin
def search(name, driver):
    searchElem = driver.find_element_by_xpath("//div[@id='nav-typeahead-wormhole']//input[1]")
    searchElem.clear()
    searchElem.send_keys(name)

    searchElem.send_keys(Keys.ENTER);
    lst_Count= driver.find_elements_by_xpath("//div[@class='search-result__wrapper']")
    
    if len(lst_Count) == 1:
        search = driver.find_element_by_css_selector('h3.actor-name-with-distance')
        search.click()
#     else:
#         print("Lot's of searches")
    return

# open new tab
def new_tab(name):
    print('searching ' + name + "'s profile")
    url_link = "https://www.linkedin.com/search/results/all/?keywords=" + name + "&origin=GLOBAL_SEARCH_HEADER"
    driver.execute_script("window.open('about:blank', 'tab2');")
    return
    
# open search in new tab
def search_new_tab(name):
    time.sleep(randint(6,20))
    print('searching ' + name + "'s profile")
    url_link = "https://www.linkedin.com/search/results/all/?keywords=" + name + "&origin=GLOBAL_SEARCH_HEADER"
    
    driver.execute_script("window.open('about:blank');")
    window_list = driver.window_handles
    driver.switch_to.window(window_list[-1])
    driver.get(url_link)

    lst_Count= driver.find_elements_by_xpath("//div[@class='search-result__wrapper']")
    if len(lst_Count) == 1:
        time.sleep(randint(0,5))
        search = driver.find_element_by_css_selector('h3.actor-name-with-distance')
        search.click()
    else: scroll()
    return
        
        
# scroll the page
def scroll():
    num = randint(0,2)
    if num == 1:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(randint(3,5))
        num2 = randint(0,1)
        if num2 == 0:
            driver.execute_script("window.scrollTo(0, 0);")
    return

# close all tabs
def close_tab():
    current_window = driver.current_window_handle
    new_window = [window for window in driver.window_handles if window != current_window]
    for i in range(len(new_window)):
        new_window = [window for window in driver.window_handles if window != current_window][0]
        driver.switch_to.window(new_window)
        driver.close()
    driver.switch_to.window(current_window)
    driver.get("https://www.linkedin.com/feed/")
    return

# output collected data to excel
def write_to_excel():
    book = load_workbook(path)
    writer = pd.ExcelWriter(path, engine = 'openpyxl')
    writer.book = book

    df.to_excel(writer, sheet_name=str(x))
    writer.save()
    writer.close()
    return

Main

Logging in to LinkedIn automated by Selenium

In [21]:

driver = webdriver.Chrome()
driver.get('http://linkedin.com')

## Logging in to LinkedIn
emailElem = driver.find_element_by_id('login-email')
emailElem.send_keys('@mail.com')

passwordElem = driver.find_element_by_id('login-password')
passwordElem.send_keys('password')
passwordElem.submit()

In [5]:

path = r"managers.xlsx"
x = 29

Since I have a big list to search, I decided to divide the name to batches of y. x means the nth batch.

In [42]:

start_time = time.time()

y = 20

i = x*y 
j = i + y

if j > len(profiles.index):
    print('''
    ================
    more than enough
    ================
              ''')


search = profiles[i:j]
for index, row in search.iterrows():
    print(row['index'], end=' ')
    search_new_tab(row['Full Name'])
    
print("--- %s seconds ---" % (time.time() - start_time))
print("%i%% done" % (j/len(profiles.index) * 100))

    ================
    more than enough
    ================
              
searching jim daryl teo jin liang's profile
searching mohd najib mohammad's profile
searching nik zurin nik mohamed's profile
searching mohamed bachik mohamed hussain's profile
searching chua kiow kiow's profile
searching nicky lim hui siang's profile
searching woo choon kong's profile
searching abd wahid idris's profile
searching muniandy maruthiah's profile
searching choi vincent's profile
searching michael loh tet fook's profile
searching see beng keh's profile
--- 193.57338523864746 seconds ---
101% done

Collect data

Collect HTML from all tabs and append to data frame. Write to excel new sheet.

In [None]:

window_list = driver.window_handles
df = pd.DataFrame()

for i in range(len(window_list)):
    driver.switch_to.window(window_list[i])
#     time.sleep(randint(0,5))
#     scroll()
    
    html = driver.page_source
    soup = BeautifulSoup(html,'lxml')
    url = driver.current_url

    dict_profile = extract_basic_info(soup, url)
    df2 = pd.DataFrame.from_dict(dict_profile)
    df = df.append(df2, ignore_index=True)
    
df.head()
write_to_excel()

Check new sheet is added.

In [40]:

xls = pd.ExcelFile(path, on_demand = True)
sheets = xls.sheet_names
print(sheets)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33']

Close all tabs.

In [41]:

close_tab()
x += 1
print(x)

Share on

Twitter Facebook LinkedIn

Xun Wei

Scraping Data from LinkedIn

Import dependencies

Profiles to scrape

Methods

Main

Collect data

Share on

You may also enjoy

Debug Python Module on VS Code from Command Line

Docker with Conda and Jupyter

Tmux

Using Spyder instead of Jupyter