craw_headline_news_from_bloomberg.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 16 21:57:57 2017
@author: red-sky
"""
import bs4
import json
import sys
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
import requests
BLOOMBERG_params = {
    "sort_by_newest": "time:desc",
    "sort_by_oldest": "time:asc",
    "source_from_bloomberg": "sites=bview",
    "end_time": "2017-03-12T15:20:16.240Z"
}
DATA_TO_EXTRACT = {
    "query_list_news": ["div", {"class": "search-result-story__container"}],
    "query_headline": ["h1", {"class": "search-result-story__headline"}],
    "query_time_published": ["time", {"class": "published-at"}],
    "query_body": ["div", {"class": "search-result-story__body"}]
}
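# NOTE: the tag names / CSS classes above match Bloomberg's search-result
# markup as of early 2017, when this script was written; if the site has been
# redesigned since, these selectors will need to be updated.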
def parser_url(query_string, page,
               sort_by="sort_by_oldest",
               source="source_from_bloomberg"):
    url = "https://www.bloomberg.com/"
    # add the (URL-encoded) search query
    url = url + "search?query=" + quote_plus(query_string) + "&"
    # add sort order
    url = url + "sort=" + BLOOMBERG_params[sort_by] + "&"
    # restrict results to Bloomberg's own sites
    # (the dict value already contains the "sites=" key)
    url = url + BLOOMBERG_params[source] + "&"
    # add page number
    url = url + "page=" + str(page)
    return url
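# Illustrative example of the URL built with the default arguments:
#   parser_url("deep learning", 2)
#   -> "https://www.bloomberg.com/search?query=deep+learning&sort=time:asc&sites=bview&page=2"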
def get_rid_off_key(list_contents):
    # Join the text of all child nodes into one string, dropping the markup
    # of the tags Bloomberg uses to highlight the query keywords.
    body_string = ""
    for substring in list_contents:
        if isinstance(substring, bs4.element.Tag):
            # keep only the tag's text, not its markup
            if substring.string is not None:
                body_string += substring.string
        elif isinstance(substring, bs4.element.NavigableString):
            body_string += str(substring)
    return body_string
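# Illustrative behaviour: given a contents list like
#   ["Apple ", <span>iPhone</span>, " sales rise"]
# this returns the flat string "Apple iPhone sales rise"
# (the highlight tag shown here is only a guess at Bloomberg's markup).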
def extract_from_url(url):
    result = []
    try:
        with requests.get(url) as response:
            html_of_page = response.text
        soup_object = BeautifulSoup(html_of_page, "lxml")
        # Extract the list of news blocks from the soup object
        param_to_find = DATA_TO_EXTRACT["query_list_news"]
        list_of_news = soup_object.find_all(param_to_find[0],
                                            attrs=param_to_find[1])
        if len(list_of_news) == 0:
            return None
        # extract time, headline and body from each news block
        for block_new in list_of_news:
            # extract publication time from the block
            param_to_find = DATA_TO_EXTRACT["query_time_published"]
            time = block_new.find_all(param_to_find[0],
                                      attrs=param_to_find[1])
            time = time[0]["datetime"]
            # extract the news headline
            param_to_find = DATA_TO_EXTRACT["query_headline"]
            headline = block_new.find_all(param_to_find[0],
                                          attrs=param_to_find[1])
            headline = get_rid_off_key(headline[0].a.contents)
            # extract the news body as a plain string
            param_to_find = DATA_TO_EXTRACT["query_body"]
            body = block_new.find_all(param_to_find[0],
                                      attrs=param_to_find[1])
            # print(body)  # for debug
            body_string = get_rid_off_key(body[0].contents)
            extracted_from_block = {"time": time,
                                    "headline": headline,
                                    "body": body_string}
            # for debug:
            # print("\t".join(extracted_from_block))
            if len(body_string) >= 5:
                result.append(extracted_from_block)
    except Exception as inst:
        print("Something went wrong:", inst)
        print("URL: ", url)
        result = []
    return result
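# Return value summary: a list of dicts of the form
#   {"time": <ISO datetime string>, "headline": <str>, "body": <str>},
# None when the page holds no result blocks (end of results),
# or [] when the request or parsing raised an exception.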
def Query(key, max_page=5000):
    # Start at page 1 and keep requesting pages until no more results come back
    page = 1
    all_result_query = []
    error = 0
    while page < max_page:
        print("Collected: %d articles" % len(all_result_query))
        new_url = parser_url(key, page)
        result = extract_from_url(new_url)
        if result is None:
            # the page contained no result blocks -- assume we reached the end
            break
        if len(result) > 0 or error > 10:
            # page parsed fine (or too many failed retries): move on
            page += 1
            error = 0
        else:
            # empty result from a failed request: retry the same page
            error += 1
        all_result_query += result
    return all_result_query
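# Illustrative programmatic use:
#   articles = Query("deep learning", max_page=10)
#   print("%d articles collected" % len(articles))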
if __name__ == "__main__":
print("Begin query information about: ", sys.argv[1])
print("Then will save result in: ", sys.argv[2])
News = Query(sys.argv[1], int(sys.argv[4]))
file_name1 = sys.argv[2]
with open(file_name1, "w") as W:
json.dump(News, W, indent=1)
file_name2 = sys.argv[3]
with open(file_name2, "w") as W:
W.write("\n".join([new["body"] for new in News]))
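# Illustrative invocation (output file names are placeholders):
#   python craw_headline_news_from_bloomberg.py "deep learning" \
#       news.json bodies.txt 100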