Gibberish when writing news to html

Question

question

YanHan

1 ●0 ●0 ●1

Gibberish when writing news to html

Hello,

I tried to download daily oil Chinese news to html file.The code is as follows:

#OIL新闻下载

def download_headlines(query,S_date,E_date):

headlines = pd.DataFrame()

try:

while E_date > S_date:

headline = ek.get_news_headlines(query,date_to=E_date,count=100)

headlines = headlines.append(headline)

t = headline.iat[len(headline)-1,0]

E_date = t.strftime("%Y-%m-%dT%H:%M:%S.%f")

except:

print('出错了...')

finally:

headlines = headlines.drop_duplicates()

headlines = headlines.drop(headlines[headlines.versionCreated < S_date].index)

print('下载新闻标题总数：%d' % len(headlines))

return headlines

query_oil = '( Topic:CRU OR Topic:PROD ) AND Source:RTRS NOT ( Product:RITV OR Product:RCNBC OR Product:LCNBC ) in LZS'

End_date = datetime.datetime.now()

Start_date = End_date - timedelta(days=1)

headlines_oil = download_headlines(query_oil,str(Start_date),str(End_date))

headlines = headlines_oil

#下载全部新闻内容

headlines['story'] = [ek.get_news_story(each) for each in headlines['storyId'].tolist()]

#正则化

import re

headlines['story'] = headlines['story'].apply(lambda x: re.sub(r"||</div>",'',x)) # 删除段落符

headlines['story'] = headlines['story'].apply(lambda x: re.sub(r" ",' ',x)) # 删除段落之间的过多空行

headlines['story'] = headlines['story'].apply(lambda x: re.sub(r" ",' ',x)) # 使段落之间强制空一行

headlines['story'] = headlines['story'].apply(lambda x: re.sub(r"[(|（]完(.*?)$",'【完】',x)) # 修改（完） -> 【完】

headlines['story'] = headlines['story'].apply(lambda x: re.sub(r"[(|（]c(.*?)Copyright(.*?)$",'',x)) # 删除“(c) Copyright”之后的内容

headlines['story'] = headlines['story'].apply(lambda x: re.sub(r"[(|（]([编译|发稿|整理|审校|Reporting|Editting])(.*?)$",'',x)) # 删除“（编译|发稿|Reporting”之后的内容

#转化为html文件

os.chdir(r'D:/TASK/Daily')

f = open('daily.html','w',encoding='utf-8') # 创建 news_archive.html

f = open('daily.html','a',encoding='utf-8') # 设置追加写入模式

f.writelines('目录<hr>')

table_of_content=headlines.text.tolist()

for each in table_of_content:

f.writelines(each + ' ')

f.writelines('<hr> ')

for i in range(len(headlines)):

f.writelines(''+ headlines.iat[i,1]+' ') #写入新闻标题，红色，空一行

f.write(headlines.iat[i,4]+'<hr> ') #写入新闻全文，文后加横线分隔

f.close()

However,though I've input encoding = 'utf-8' when writing html file,the output are still garbled words.Can you help me?

Thank you

eikon eikon-data-api python refinitiv-dataplatform-eikon workspace workspace-data-api

1614735243752.png (119.0 KiB)

Mar 03, 2021 at 01:35 AM

10 |1500

Attachments: Up to 2 attachments (including images) can be used with a maximum of 512.0 KiB each and 1.0 MiB total.

wasin.w Mar 11, 2021 at 06:36 AM

Hello @YanHan

Thank you for your participation in the forum. Is the reply below satisfactory in resolving your query?

If so please can you click the 'Accept' text next to the appropriate reply? This will guide all community members who have a similar question.

Thanks,

AHS

wasin.w Mar 25, 2021 at 07:44 AM

Please be informed that a reply has been verified as correct in answering the question, and has been marked as such.

Thanks,

AHS

Answer 1 · 2021-03-03T03:09:59Z

chavalit-jintamalit

18.2k ●21 ●13 ●21

Hi @YanHan

I just test this code and asked my colleague to confirm that the result are the same (no gibberish in the story news from API).

News from Eikon "News Monitor" app:

News from API:

news_id = 'urn:newsml:reuters.com:20210303:nL3S2L10QI:3'
df = ek.get_news_story(news_id)

from IPython.core.display import HTML
HTML(df)

ahs1.png (193.7 KiB)

ahs2.png (71.4 KiB)

Mar 03, 2021 at 03:09 AM

10 |1500

Attachments: Up to 2 attachments (including images) can be used with a maximum of 512.0 KiB each and 1.0 MiB total.

Answer 2 · 2021-03-03T03:26:52Z

moragodkrit.chumsri_1

7.6k ●15 ●6 ●9

@YanHan

I tested your codes on my jupyter notebook running on my windows 10 but did not find the garbled words like your screenshot. You can find the sample output from the attached daily.zip file.

daily.zip

And below are the codes I have modified a little bit.

#OIL新闻下载
import pandas as pd
def download_headlines(query,S_date,E_date):
    headlines = pd.DataFrame()
    try:
        while E_date > S_date:
            headline = ek.get_news_headlines(query,date_to=E_date,count=100)
            headlines = headlines.append(headline)
            t = headline.iat[len(headline)-1,0]
            E_date = t.strftime("%Y-%m-%dT%H:%M:%S.%f")
    except:
        print('出错了...')
    finally:
        headlines = headlines.drop_duplicates()
        headlines = headlines.drop(headlines[headlines.versionCreated < S_date].index)
        print('下载新闻标题总数：%d' % len(headlines))
    return headlines
import datetime
query_oil = '( Topic:CRU OR Topic:PROD ) AND Source:RTRS NOT ( Product:RITV OR Product:RCNBC OR Product:LCNBC ) in LZS'
End_date = datetime.datetime.now()
Start_date = End_date - datetime.timedelta(days=1)
headlines_oil = download_headlines(query_oil,str(Start_date),str(End_date))
headlines = headlines_oil
#下载全部新闻内容
headlines['story'] = [ek.get_news_story(each) for each in headlines['storyId'].tolist()]
#正则化
import re


headlines['story'] = headlines['story'].apply(lambda x: re.sub(r"</p>|<p>|</div>",'',x)) # 删除段落符
headlines['story'] = headlines['story'].apply(lambda x: re.sub(r"<br/><br/>",'<br/>',x)) # 删除段落之间的过多空行
headlines['story'] = headlines['story'].apply(lambda x: re.sub(r"<br/>",'<br/><br/>',x)) # 使段落之间强制空一行
headlines['story'] = headlines['story'].apply(lambda x: re.sub(r"[(|（]完(.*?)$",'【完】',x)) # 修改 （完） -> 【完】
headlines['story'] = headlines['story'].apply(lambda x: re.sub(r"[(|（]c(.*?)Copyright(.*?)$",'',x)) # 删除“(c) Copyright”之后的内容
headlines['story'] = headlines['story'].apply(lambda x: re.sub(r"[(|（]([编译|发稿|整理|审校|Reporting|Editting])(.*?)$",'',x)) # 删除“（编译|发稿|Reporting”之后的内容


#转化为html文件
import os
os.chdir(r'c:\\tmp')
#f = open('daily.html','w',encoding='utf-8') # 创建 news_archive.html
f = open('daily.html','a+',encoding='utf-8') # 设置追加写入模式
f.writelines('目录<hr>')
table_of_content=headlines.text.tolist()
for each in table_of_content:
    f.writelines(each + '<br/>')
    f.writelines('<hr><br/>')

for i in range(len(headlines)):
    f.writelines('<font color="#FF0000">'+ headlines.iat[i,1]+'</font><br/><br/>') #写入新闻标题，红色，空一行
    f.write(headlines.iat[i,4]+'<hr><br/>') #写入新闻全文，文后加横线分隔
f.close()

daily.zip (5.8 KiB)

Mar 03, 2021 at 03:26 AM

10 |1500

Attachments: Up to 2 attachments (including images) can be used with a maximum of 512.0 KiB each and 1.0 MiB total.

Q&A Forum

question

Gibberish when writing news to html

2 Answers

Write an Answer

question

Gibberish when writing news to html

2 Answers

Write an Answer

Related Questions