Dataset of the LinkedIn profiles of 1,595 people who self-identified on the site as working for ICE. https://archive.is/20180621131016/https://medium.com/@samlavigne/downloading-the-profiles-of-everyone-on-linkedin-who-works-for-ice-c4e0ff6b065e
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

224 lines
5.8 KiB

import time
import json
import csv
import os
import requests
from bs4 import BeautifulSoup
from jinja2 import Template
import headers
# these represent different job functions
# (numeric ids for LinkedIn's "FA" search facet)
FUNCTION_FACETS = [17, 18, 14, 2, 4, 20, 5, 13, 12, 26] #FA
# seniority-level ids for the "SE" facet — presumably 1 is most junior
# and 10 most senior; TODO confirm against LinkedIn's facet docs
SENIORITY_FACETS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] #SE
# geography ids for the "G" facet; mix of state codes ('us:va') and
# opaque numeric region codes ('us:8-2-0-1-2') — semantics unverified
LOCATION_FACETS = [ #G
'us:8-2-0-1-2',
'us:97',
'us:va',
'us:dc',
'us:tx',
'us:ca',
'us:md',
'us:70',
'us:31',
'us:ny',
'us:8-8-0-8-1',
'us:8-8-0-3-1',
'us:ga',
'us:52',
'us:7',
'us:8-8-0-95-11',
'us:nj',
'us:3-2-0-31-1',
]
# (facet code, list of facet values) pairs iterated by get_company();
# each pair narrows the per-company search to keep result sets pageable
FACETS = [
('FA', FUNCTION_FACETS),
('SE', SENIORITY_FACETS),
('G', LOCATION_FACETS)
]
def download_file(url, local_filename=None):
    '''Download a file in streaming chunks.

    Adapted from: https://stackoverflow.com/a/16696317

    Args:
        url: the URL to fetch.
        local_filename: destination path; defaults to the last path
            segment of the URL.

    Returns:
        The path the file was written to.

    Raises:
        requests.HTTPError: if the server returns an error status
            (previously the error body was silently written to disk).
    '''
    if local_filename is None:
        local_filename = url.split('/')[-1]
    print('saving to', local_filename)
    # Stream so large files are never held fully in memory; the context
    # manager releases the connection back to the pool when done.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
    return local_filename
def get_page(company_id, facet=None, facet_id=None, start=0, count=50):
    '''Get a single page of LinkedIn Sales Navigator search results.

    Args:
        company_id: LinkedIn company id (the CC facet value).
        facet: optional extra facet code ('FA', 'SE' or 'G').
        facet_id: value for the extra facet; both facet and facet_id must
            be given for the narrowing to apply.
        start: zero-based result offset for pagination.
        count: page size.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: on a 4xx/5xx response (e.g. auth expiry or
            rate limiting).
    '''
    params = {
        'facet': ['CC'],
        'facet.CC': company_id,
        'count': count,
        'start': start,
    }
    if facet is not None and facet_id is not None:
        # Narrow the company search by one additional facet.
        params['facet'] = ['CC', facet]
        params['facet.' + facet] = facet_id
    response = requests.get(
        'https://www.linkedin.com/sales/search/results',
        headers=headers.headers,
        params=params,
    )
    # Fail loudly on HTTP errors instead of letting .json() raise a
    # confusing decode error on an HTML error page.
    response.raise_for_status()
    return response.json()
def get_company(company_id, outname):
    '''Collect every employee listing for a company, facet by facet, and
    dump the combined raw results to a JSON file.

    Args:
        company_id: LinkedIn company id passed through to get_page().
        outname: path of the JSON file to write.

    Returns:
        outname, for chaining.
    '''
    page_size = 50
    employees = []
    for facet_name, facet_values in FACETS:
        for value in facet_values:
            print('getting facet', facet_name, value, 'for company', company_id)
            # First page also tells us how many results this facet has.
            first = get_page(company_id, facet_name, value)
            total = first['pagination']['total']
            employees.extend(first['searchResults'])
            offset = page_size
            # Walk the remaining pages, pausing between requests.
            while offset < total:
                print('getting', offset, 'of', total)
                time.sleep(1)
                page = get_page(company_id, facet_name, value, offset)
                employees.extend(page['searchResults'])
                offset += page_size
    with open(outname, 'w') as outfile:
        json.dump(employees, outfile, indent=2)
    return outname
def get_images(datafile):
    '''Download the largest available profile photo for each person.

    Reads the raw listings JSON and saves each photo to
    images/<memberId>.jpg, skipping people with no photo and photos
    already on disk.
    '''
    with open(datafile, 'r') as infile:
        members = [entry['member'] for entry in json.load(infile)]
    for member in members:
        if 'vectorImage' not in member:
            continue  # no photo available for this person
        outname = 'images/{}.jpg'.format(member['memberId'])
        if os.path.exists(outname):
            print('skipping')
            continue
        # Pick the widest rendition of the image.
        artifacts = sorted(member['vectorImage']['artifacts'],
                           key=lambda a: a['width'])
        url = (member['vectorImage']['rootUrl']
               + artifacts[-1]['fileIdentifyingUrlPathSegment'])
        print(url)
        download_file(url, outname)
        time.sleep(1)
def get_profile(pid):
    '''Scrape one member's full profile page and cache it as JSON.

    Args:
        pid: the member's profileId.

    Returns:
        The path profiles/<pid>.json (written even when parsing finds
        nothing, so the member is not re-fetched next run).
    '''
    outname = 'profiles/{}.json'.format(pid)
    if os.path.exists(outname):
        return outname  # already cached
    url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
    print(url)
    response = requests.get(url, headers=headers.headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    profile = {}
    # The profile payload is embedded in one of several <code> tags;
    # best-effort: take the first JSON blob that carries contact info.
    for code in soup.select('code'):
        try:
            parsed = json.loads(code.text)
        except Exception:
            continue  # not JSON; try the next tag
        if 'contactInfo' in parsed:
            profile = parsed
            break
    with open(outname, 'w') as outfile:
        json.dump(profile, outfile)
    time.sleep(1)
    return outname
def get_profiles(datafile):
    '''Download the full profile of every person in the listings file.'''
    with open(datafile, 'r') as infile:
        listings = json.load(infile)
    for listing in listings:
        get_profile(listing['member']['profileId'])
def clean_and_parse(datafile, outname):
    '''Deduplicate raw listings and emit <outname>.json, <outname>.csv
    and index.html (rendered from template.html in the working dir).

    Args:
        datafile: path to the raw JSON produced by get_company().
        outname: basename (without extension) for the json/csv outputs.
    '''
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    out = []
    seen = set()  # memberIds already emitted; set gives O(1) dedup
    for d in data:
        member = d['member']
        mid = member['memberId']
        if mid in seen:
            continue
        seen.add(mid)
        imgpath = 'images/{}.jpg'.format(mid)
        if not os.path.exists(imgpath):
            imgpath = None  # photo was never downloaded
        out.append({
            'name': member.get('formattedName', ''),
            'title': member.get('title', ''),
            'img': imgpath,
            'company': d['company'].get('companyName', ''),
            'location': member.get('location', ''),
            'id': mid,
            'linkedin': 'https://linkedin.com/in/' + member['profileId'],
        })
    with open(outname + '.json', 'w') as jsonfile:
        json.dump(out, jsonfile, indent=2)
    # Fixed header list: no longer crashes with IndexError on empty data
    # (previously derived from out[0].keys()).
    fieldnames = ['name', 'title', 'img', 'company', 'location', 'id',
                  'linkedin']
    # newline='' per the csv docs, so Windows doesn't get blank rows.
    with open(outname + '.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(out)
    with open('template.html', 'r') as templatefile:
        template = Template(templatefile.read())
    html = template.render(people=out)
    with open('index.html', 'w') as htmlout:
        htmlout.write(html)
if __name__ == '__main__':
    # ICE's LinkedIn company id (used as the facet.CC search value)
    ICE = '533534'
    datafile = 'ice_raw.json'
    # Full pipeline: scrape employee listings, then each person's
    # profile and photo, then emit ice.json / ice.csv / index.html.
    get_company(ICE, datafile)
    get_profiles(datafile)
    get_images(datafile)
    clean_and_parse(datafile, 'ice')