308 lines
9.8 KiB
Python
Executable File
308 lines
9.8 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
from bs4 import BeautifulSoup as bs
|
|
import csv
|
|
# from fuzzywuzzy import fuzz
|
|
import glob
|
|
from pprint import pprint
|
|
import re
|
|
import calendar
|
|
import phonenumbers
|
|
|
|
# Directory holding the saved profile pages (files named ``about_*.html``).
DIR_DATA = 'data/'

# Matches "<word> <digits>" with an optional trailing ", <digits>", e.g.
# "March 5" or "March 5, 1990".  The named groups are month, day and year.
PATTERN_BIRTHDAY = re.compile(r'(?P<month>\w+?) (?P<day>\d+)([, ]+(?P<year>\d+)){0,}')

# Column order of the output CSV.
# NOTE(review): the names look like a Google Contacts export header -- confirm.
FIELDNAMES = [
    'Name', 'Given Name', 'Additional Name', 'Family Name', 'Yomi Name',
    'Given Name Yomi', 'Additional Name Yomi', 'Family Name Yomi',
    'Name Prefix', 'Name Suffix', 'Initials', 'Nickname', 'Short Name',
    'Maiden Name', 'Birthday', 'Gender', 'Location', 'Billing Information',
    'Directory Server', 'Mileage', 'Occupation', 'Hobby', 'Sensitivity',
    'Priority', 'Subject', 'Notes', 'Language', 'Photo', 'Group Membership',
    'E-mail 1 - Type', 'E-mail 1 - Value', 'IM 1 - Type', 'IM 1 - Service',
    'IM 1 - Value', 'Website 1 - Type', 'Website 1 - Value',
    'Phone 1 - Type', 'Phone 1 - Value',
]

# Output file, written in the current working directory.
FILE_CSV = 'from_facebook.csv'

# Page titles that mark a saved file as not being a usable profile page.
TITLES_ERROR = (
    "You Can't Use This Feature Right Now",
    "Content Not Found",
    "Error Facebook",
)

# Profile key -> on-page label, for every field that is read the same way:
# first occurrence of the label, three ancestors up, stripped text.
SIMPLE_LABELS = {
    'gender': 'Gender',
    'link_facebook': 'Facebook',
    'link_instagram': 'Instagram',
    'link_youtube': 'YouTube',
    'link_twitter': 'Twitter',
    'link_other': 'Other Service',
    'link_tumblr': 'Tumblr',
    'social_snapchat': 'Snapchat',
    'social_ebuddy': 'eBuddy',
    'social_line': 'LINE',
    'social_skype': 'Skype',
    'city_current': 'Current City',
    'city_home': 'Hometown',
}


def _sibling_text(node, levels, strip):
    """Climb *levels* parents from *node* and return its next sibling's text.

    Returns None when an ancestor or the sibling is missing, or when the
    sibling's text is empty.
    """
    for _ in range(levels):
        node = node.parent
        if node is None:
            return None
    value_node = node.next_sibling
    if value_node is None:
        return None
    return value_node.get_text(strip=strip) or None


def label_value(soup, label, levels=3, strip=True):
    """Return the text displayed next to the first *label* string in *soup*.

    The value is taken from the next sibling of the label's *levels*-th
    ancestor.  *strip* is forwarded to ``get_text``.  Returns None when the
    label, the expected surrounding structure, or the text is absent.
    """
    node = soup.find(string=label)
    if node is None:
        return None
    return _sibling_text(node, levels, strip)


def label_values(soup, label, levels=3, strip=True):
    """Return the non-empty values for every occurrence of *label* in *soup*."""
    values = (
        _sibling_text(node, levels, strip)
        for node in soup.find_all(string=label)
    )
    return [value for value in values if value]


def parse_birthday(text):
    """Convert birthday text such as "March 5, 1990" into two string forms.

    Returns ``(compact, slashed)``:

    * with a year:    ``('19900305', '03/05/1990')``
    * without a year: ``('--0305', '03/05')``

    Returns ``(None, None)`` when *text* is empty, does not match
    PATTERN_BIRTHDAY, or its month word is not a name in
    ``calendar.month_name``.
    """
    if not text:
        return None, None
    found = PATTERN_BIRTHDAY.search(text)
    if not found:
        return None, None

    try:
        month_index = list(calendar.month_name).index(found['month'])
    except ValueError:
        # The regex accepts any word before the day; only real month names
        # can be turned into a number.
        return None, None

    num_month = format(month_index, '02d')
    num_day = found['day'].zfill(2)
    year = found['year']
    if year:
        return year + num_month + num_day, num_month + '/' + num_day + '/' + year
    return '--' + num_month + num_day, num_month + '/' + num_day


def format_mobile(raw, region='IN'):
    """Return *raw* formatted as RFC 3966, or None if it cannot be parsed.

    *region* is the default region handed to ``phonenumbers.parse`` for
    numbers written without an international prefix.
    """
    try:
        parsed = phonenumbers.parse(raw, region)
    except phonenumbers.NumberParseException:
        return None
    return phonenumbers.format_number(
        parsed, phonenumbers.PhoneNumberFormat.RFC3966)


def extract_name(soup):
    """Return ``(full_name, alternate_name)`` read from the name heading.

    The heading is the first ``div span div span strong`` element.  Text
    nodes whose direct parent is a ``span`` are left out of the full name.
    The alternate name is the text of the ``.alternate_name`` element inside
    the heading with its first and last character removed.  Both values are
    None when the heading is missing; the alternate name is None when that
    element is missing.
    """
    heading = soup.select_one('div span div span strong')
    if heading is None:
        return None, None

    name_full = ''.join(
        text for text in heading.find_all(string=True)
        if text.parent.name != 'span'
    )
    alternate = heading.select_one('.alternate_name')
    name_alternate = alternate.get_text()[1:-1] if alternate is not None else None
    return name_full, name_alternate


def contact_info_labels(soup):
    """Return the set of span texts found in the ``#contact-info`` table.

    Debugging aid for discovering which labels a page uses; it is not called
    during a normal run.
    """
    selector = '#contact-info > div > div:nth-of-type(2) table tr td div span'
    return {node.get_text() for node in soup.select(selector)}


def extract_profile(soup):
    """Collect every supported field of one parsed profile page into a dict.

    Missing fields are None (or an empty list for 'emails' and 'websites').
    'mobile_raw' is the text as found on the page; 'mobile' is its RFC 3966
    form, or None when the number is absent or cannot be parsed.
    """
    name_full, name_alternate = extract_name(soup)
    birthday, birthday_google = parse_birthday(
        label_value(soup, 'Birthday', strip=False))
    mobile_raw = label_value(soup, 'Mobile')

    profile = {
        'name': name_full,
        'name_alternate': name_alternate,
        'birthday': birthday,
        'birthday_google': birthday_google,
        'emails': label_values(soup, 'Email'),
        # Relationship
        # The value sits six ancestors above its label, not three.
        'relationship': label_value(soup, 'Relationship', levels=6),
        # Address
        'address': label_value(soup, 'Address', strip=False),
        'websites': label_values(soup, 'Websites'),
        'mobile_raw': mobile_raw,
        'mobile': format_mobile(mobile_raw) if mobile_raw else None,
    }
    for key, label in SIMPLE_LABELS.items():
        profile[key] = label_value(soup, label)
    return profile


def build_csv_row(profile):
    """Map a profile dict onto the FIELDNAMES columns that are filled in.

    'Name' is always present; the other columns appear only when the profile
    has a value for them.  Only the first e-mail address is exported because
    FIELDNAMES has a single e-mail column.
    """
    row = {'Name': profile.get('name')}
    if profile.get('birthday'):
        row['Birthday'] = profile['birthday']
    if profile.get('gender'):
        row['Gender'] = profile['gender']
    if profile.get('emails'):
        row['E-mail 1 - Value'] = profile['emails'][0]
    if profile.get('mobile'):
        row['Phone 1 - Value'] = profile['mobile']
    return row


def main():
    """Write one CSV row per usable ``about_*.html`` page under DIR_DATA."""
    list_files = glob.glob(DIR_DATA + 'about_*.html')

    # The output is opened once: header first, then one row per profile.
    with open(FILE_CSV, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
        writer.writeheader()

        for file in list_files:
            with open(file) as f:
                soup = bs(f.read(), 'lxml')

            # Look for files with incorrect content
            title = soup.title.text if soup.title else ''
            if title in TITLES_ERROR:
                print(file, 'is wrong. Skipping.')
                print()
                continue

            profile = extract_profile(soup)
            if profile['name'] is None:
                print(file, 'has no name heading. Skipping.')
                print()
                continue
            if profile['mobile_raw'] and not profile['mobile']:
                print(file, 'has an unparsable mobile number:',
                      profile['mobile_raw'])

            # pprint(profile)  # dump every extracted field while debugging

            # Save to CSV
            writer.writerow(build_csv_row(profile))


if __name__ == '__main__':
    main()