-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathBlock.py
More file actions
98 lines (73 loc) · 2.78 KB
/
Block.py
File metadata and controls
98 lines (73 loc) · 2.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import feedparser as f
import lxml.html as html
import requests
import urllib.request
class Lenta():
    """Reader for the lenta.ru RSS feed and for lenta.ru article pages."""

    # Address of the RSS feed that ``news`` parses.
    link = "https://lenta.ru/rss"

    def news(self, num):
        """Return up to ``num`` entries from the start of the RSS feed.

        Args:
            num: Maximum number of entries to return. Zero or a negative
                value yields an empty list.

        Returns:
            A list of dicts with the keys ``"title"``, ``"link"``,
            ``"desc"`` and ``"published"``. A field that the feed entry
            does not provide is reported as the string ``"None"``, the
            same placeholder ``grub`` uses for a missing image.

        Raises:
            Exception: ``bozo_exception`` as recorded by feedparser, but
                only when the feed was flagged as malformed and no entry
                at all could be parsed.
        """
        d = f.parse(self.link)
        # feedparser flags many recoverable problems via ``bozo`` while
        # still delivering entries, so only give up when nothing usable
        # was parsed.
        if d.bozo and not d.entries:
            raise d.bozo_exception
        # Slice from index 0: the previous ``range(1, num + 1)`` skipped the
        # first entry and raised IndexError on feeds with fewer than
        # ``num + 1`` entries.
        return [
            {
                "title": entry.get("title", "None"),
                "link": entry.get("link", "None"),
                "desc": entry.get("description", "None"),
                "published": entry.get("published", "None"),
            }
            for entry in d.entries[:max(num, 0)]
        ]

    def grub(self, link, title="None", img="None", content="None"):
        """Download a lenta.ru article and extract title, image and text.

        Args:
            link: URL of the article page.
            title: Ignored; the selectors are fixed for lenta.ru.
            img: Ignored.
            content: Ignored.
                NOTE(review): the three ignored parameters presumably exist
                so the signature resembles ``Parser.grub`` -- confirm.

        Returns:
            A dict with the keys ``"title"`` (text of the first
            ``h1.b-topic__title`` element), ``"image"`` (``src`` of the
            first ``div.b-topic__title-image img.g-picture`` element, or
            the string ``'None'`` when there is no such element) and
            ``"content"`` (list with the text of every ``div.b-text p``
            element).

        Raises:
            IndexError: If the page has no ``h1.b-topic__title`` element.
        """
        tree = self._load_tree(link)
        paragraphs = [p.text_content() for p in tree.cssselect('div.b-text p')]
        headline = tree.cssselect('h1.b-topic__title')[0].text
        # The image is optional: report the placeholder instead of failing.
        pictures = tree.cssselect('div.b-topic__title-image img.g-picture')
        image = pictures[0].get("src") if pictures else 'None'
        return {"title": headline,
                "image": image,
                "content": paragraphs}

    @staticmethod
    def _load_tree(link):
        """Fetch ``link`` once and return the page parsed by lxml.html."""
        fallback = 'windows-1251'
        # One request provides both the body and the declared charset, and
        # the ``with`` block closes the connection. Previously the page was
        # downloaded twice and the second response was never closed.
        with urllib.request.urlopen(link) as response:
            charset = response.headers.get_content_charset()
            raw = response.read()
        try:
            text = raw.decode(charset or fallback)
        except (LookupError, UnicodeDecodeError):
            # Unknown charset name, or bytes that do not match the declared
            # charset: fall back to the legacy default.
            text = raw.decode(fallback)
        return html.fromstring(text)
class Parser():
    """Generic RSS reader and article scraper driven by CSS selectors."""

    def __init__(self, link):
        """Remember ``link``, the address of the RSS feed read by ``news``."""
        self.link = link

    def news(self, num):
        """Return up to ``num`` entries from the start of the RSS feed.

        Args:
            num: Maximum number of entries to return. Zero or a negative
                value yields an empty list.

        Returns:
            A list of dicts with the keys ``"title"``, ``"link"``,
            ``"desc"`` and ``"published"``. A field that the feed entry
            does not provide is reported as the string ``"None"``, the
            same placeholder ``grub`` uses for missing parts.

        Raises:
            Exception: ``bozo_exception`` as recorded by feedparser, but
                only when the feed was flagged as malformed and no entry
                at all could be parsed.
        """
        d = f.parse(self.link)
        # feedparser flags many recoverable problems via ``bozo`` while
        # still delivering entries, so only give up when nothing usable
        # was parsed.
        if d.bozo and not d.entries:
            raise d.bozo_exception
        # Slice from index 0: the previous ``range(1, num + 1)`` skipped the
        # first entry and raised IndexError on feeds with fewer than
        # ``num + 1`` entries.
        return [
            {
                "title": entry.get("title", "None"),
                "link": entry.get("link", "None"),
                "desc": entry.get("description", "None"),
                "published": entry.get("published", "None"),
            }
            for entry in d.entries[:max(num, 0)]
        ]

    def grub(self, link, title="None", img="None", cont="None"):
        """Download a page and extract the parts named by CSS selectors.

        The string ``"None"`` (not the ``None`` object) means "no selector
        given" for each of the three selector parameters.

        Args:
            link: URL of the page to download.
            title: CSS selector of the title element.
            img: CSS selector of the image element.
            cont: CSS selector of the elements holding the article text.

        Returns:
            A dict with the keys:

            * ``"title"`` -- ``.text`` of the first element matching
              ``title``, or ``"None"`` when no selector was given.
            * ``"image"`` -- ``src`` attribute of the first element
              matching ``img``, or ``"None"`` when no selector was given
              or nothing matched.
            * ``"content"`` -- list with the text of every element
              matching ``cont``, or ``["None"]`` when no selector was
              given.

        Raises:
            IndexError: If ``title`` is given but matches no element.
        """
        tree = self._load_tree(link)

        if cont != "None":
            content = [node.text_content() for node in tree.cssselect(cont)]
        else:
            content = [cont]

        # The image is optional: a selector without a match yields the
        # placeholder instead of an error.
        image = "None"
        if img != "None":
            pictures = tree.cssselect(img)
            if pictures:
                image = pictures[0].get("src")

        if title != "None":
            heading = tree.cssselect(title)[0].text
        else:
            heading = title

        return {"title": heading,
                "image": image,
                "content": content}

    @staticmethod
    def _load_tree(link):
        """Fetch ``link`` once and return the page parsed by lxml.html."""
        fallback = 'windows-1251'
        # One request provides both the body and the declared charset, and
        # the ``with`` block closes the connection. Previously the page was
        # downloaded twice and the second response was never closed.
        with urllib.request.urlopen(link) as response:
            charset = response.headers.get_content_charset()
            raw = response.read()
        try:
            text = raw.decode(charset or fallback)
        except (LookupError, UnicodeDecodeError):
            # Unknown charset name, or bytes that do not match the declared
            # charset: fall back to the legacy default.
            text = raw.decode(fallback)
        return html.fromstring(text)