Dataset of the LinkedIn profiles of 1,595 people who self-identified on the site as working for ICE. https://archive.is/20180621131016/https://medium.com/@samlavigne/downloading-the-profiles-of-everyone-on-linkedin-who-works-for-ice-c4e0ff6b065e
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

225 lines
5.8 KiB

  1. import time
  2. import json
  3. import csv
  4. import os
  5. import requests
  6. from bs4 import BeautifulSoup
  7. from jinja2 import Template
  8. import headers
  9. # these represent different job functions
  10. FUNCTION_FACETS = [17, 18, 14, 2, 4, 20, 5, 13, 12, 26] #FA
  11. SENIORITY_FACETS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] #SE
  12. LOCATION_FACETS = [ #G
  13. 'us:8-2-0-1-2',
  14. 'us:97',
  15. 'us:va',
  16. 'us:dc',
  17. 'us:tx',
  18. 'us:ca',
  19. 'us:md',
  20. 'us:70',
  21. 'us:31',
  22. 'us:ny',
  23. 'us:8-8-0-8-1',
  24. 'us:8-8-0-3-1',
  25. 'us:ga',
  26. 'us:52',
  27. 'us:7',
  28. 'us:8-8-0-95-11',
  29. 'us:nj',
  30. 'us:3-2-0-31-1',
  31. ]
  32. FACETS = [
  33. ('FA', FUNCTION_FACETS),
  34. ('SE', SENIORITY_FACETS),
  35. ('G', LOCATION_FACETS)
  36. ]
  37. def download_file(url, local_filename=None):
  38. '''Downloads a file with requests
  39. from: https://stackoverflow.com/a/16696317
  40. '''
  41. if local_filename is None:
  42. local_filename = url.split('/')[-1]
  43. print('saving to', local_filename)
  44. r = requests.get(url, stream=True)
  45. with open(local_filename, 'wb') as f:
  46. for chunk in r.iter_content(chunk_size=1024):
  47. if chunk:
  48. f.write(chunk)
  49. return local_filename
  50. def get_page(company_id, facet=None, facet_id=None, start=0, count=50):
  51. '''Gets a single page of results from linkedin for a particular job function at a company'''
  52. params = {
  53. 'facet': ['CC'],
  54. 'facet.CC': company_id,
  55. 'count': count,
  56. 'start': start,
  57. }
  58. if facet is not None and facet_id is not None:
  59. params['facet'] = ['CC', facet]
  60. params['facet.' + facet] = facet_id
  61. response = requests.get('https://www.linkedin.com/sales/search/results', headers=headers.headers, params=params)
  62. return response.json()
  63. def get_company(company_id, outname):
  64. '''Gets all employees from a company using particular job functions'''
  65. people = []
  66. for facet, facet_ids in FACETS:
  67. for facet_id in facet_ids:
  68. print('getting facet', facet, facet_id, 'for company', company_id)
  69. count = 50
  70. start = 0
  71. results = get_page(company_id, facet, facet_id)
  72. total = results['pagination']['total']
  73. people += results['searchResults']
  74. start += count
  75. while start < total:
  76. print('getting', start, 'of', total)
  77. time.sleep(1)
  78. results = get_page(company_id, facet, facet_id, start)
  79. people += results['searchResults']
  80. start += count
  81. with open(outname, 'w') as outfile:
  82. json.dump(people, outfile, indent=2)
  83. return outname
  84. def get_images(datafile):
  85. '''Downloads profile images'''
  86. with open(datafile, 'r') as infile:
  87. people = json.load(infile)
  88. people = [p['member'] for p in people]
  89. for p in people:
  90. if 'vectorImage' not in p:
  91. continue
  92. pid = p['memberId']
  93. outname = 'images/{}.jpg'.format(pid)
  94. if os.path.exists(outname):
  95. print('skipping')
  96. continue
  97. url = p['vectorImage']['rootUrl']
  98. url += sorted(p['vectorImage']['artifacts'], key=lambda x: x['width'])[-1]['fileIdentifyingUrlPathSegment']
  99. print(url)
  100. download_file(url, outname)
  101. time.sleep(1)
  102. def get_profile(pid):
  103. '''Downloads individual profiles'''
  104. outname = 'profiles/{}.json'.format(pid)
  105. if os.path.exists(outname):
  106. return outname
  107. out = {}
  108. url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
  109. print(url)
  110. response = requests.get(url, headers=headers.headers)
  111. soup = BeautifulSoup(response.text, 'html.parser')
  112. codes = soup.select('code')
  113. for c in codes:
  114. try:
  115. d = json.loads(c.text)
  116. if 'contactInfo' in d:
  117. out = d
  118. break
  119. except Exception as e:
  120. continue
  121. with open(outname, 'w') as outfile:
  122. json.dump(out, outfile)
  123. time.sleep(1)
  124. return outname
  125. def get_profiles(datafile):
  126. '''Gets all profiles'''
  127. with open(datafile, 'r') as infile:
  128. data = json.load(infile)
  129. for d in data:
  130. pid = d['member']['profileId']
  131. get_profile(pid)
  132. def clean_and_parse(datafile, outname):
  133. '''Outputs csv, json and html from employee listings'''
  134. out = []
  135. mids = []
  136. with open(datafile, 'r') as infile:
  137. data = json.load(infile)
  138. for d in data:
  139. mid = d['member']['memberId']
  140. pid = d['member']['profileId']
  141. imgpath = 'images/{}.jpg'.format(mid)
  142. if not os.path.exists(imgpath):
  143. imgpath = None
  144. item = {
  145. 'name': d['member'].get('formattedName', ''),
  146. 'title': d['member'].get('title', ''),
  147. 'img': imgpath,
  148. 'company': d['company'].get('companyName', ''),
  149. 'location': d['member'].get('location', ''),
  150. 'id': d['member']['memberId'],
  151. 'linkedin': 'https://linkedin.com/in/' + pid,
  152. }
  153. if mid not in mids:
  154. out.append(item)
  155. mids.append(mid)
  156. with open(outname + '.json', 'w') as jsonfile:
  157. json.dump(out, jsonfile, indent=2)
  158. with open(outname + '.csv', 'w') as csvfile:
  159. fieldnames = list(out[0].keys())
  160. writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
  161. writer.writeheader()
  162. for row in out:
  163. writer.writerow(row)
  164. with open('template.html', 'r') as templatefile:
  165. template = Template(templatefile.read())
  166. html = template.render(people=out)
  167. with open('index.html', 'w') as htmlout:
  168. htmlout.write(html)
  169. if __name__ == '__main__':
  170. ICE = '533534'
  171. datafile = 'ice_raw.json'
  172. get_company(ICE, datafile)
  173. get_profiles(datafile)
  174. get_images(datafile)
  175. clean_and_parse(datafile, 'ice')