1 Preface

2 Web Crawlers

2.1 Building Your Own Scrapy Crawler

2.1.1 items.py
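
A minimal sketch of items.py, with fields inferred from what the spider in the next subsection assigns (the original file may declare additional fields):

# -*- coding: utf-8 -*-
# items.py: declares the fields that the spider fills in.
import scrapy

class Stock163Item(scrapy.Item):
    news_thread = scrapy.Field()  # unique id parsed from the article URL
    news_title = scrapy.Field()   # article title
    news_time = scrapy.Field()    # publication time
    news_from = scrapy.Field()    # name of the source outlet
    from_url = scrapy.Field()     # link to the original source
    news_body = scrapy.Field()    # list of body paragraphs
    news_url = scrapy.Field()     # URL of the crawled page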

2.1.2 The spiders Subdirectory

# encoding: utf-8
import scrapy
import re
from scrapy.selector import Selector
from stock163.items import Stock163Item
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ExampleSpider(CrawlSpider):
    name = "stocknews"  # the spider is registered under the name "stocknews"
    allowed_domains = ["money.163.com"]  # restrict crawling to this domain

    def __init__(self, id='600000', page='0', *args, **kwargs):
        # The constructor accepts a stock id (default '600000'), a page number
        # (default '0'), and any further arguments for the parent class.
        # allowrule = r"/%s/%s\d+/\d+/.*" % (year, month)
        # allowrule = r"/%s/%s%s/\d+/.*" % (year, month, day)  # would match date paths such as "/2022/1125/..."
        allowrule = r"/\d+/\d+/\d+/.*"  # regex for news links: digits/digits/digits/anything
        self.counter = 0  # counter tracking how many news items have been crawled
        self.stock_id = id  # save the stock id
        self.start_urls = ['http://quotes.money.163.com/f10/gsxw_%s,%s.html' % (id, page)]  # build the start URL from id and page
        ExampleSpider.rules = (Rule(LinkExtractor(allow=allowrule), callback="parse_news", follow=False),)
        # Crawling rule: a LinkExtractor picks out links matching allowrule and hands
        # each matched page to the parse_news callback; follow=False means links found
        # on those pages are not followed any further.
        # recompile the rule
        super(ExampleSpider, self).__init__(*args, **kwargs)
        # Call the parent (CrawlSpider) initializer so the spider is set up correctly.

    '''
    rules=Rule(LinkExtractor(allow=r"/d+/d+/d+/*"),
               callback="parse_news", follow=True
    )
    '''

    # f = open("out.txt", "w")

    def printcn(self, suni):
        # Debug helper: print each string in a sequence as UTF-8.
        for i in suni:
            print(i.encode('utf-8'))

    def parse_news(self, response):
        item = Stock163Item()
        item['news_thread'] = response.url.strip().split('/')[-1][:-5]
        # Extract the news thread id from the response URL: strip whitespace, split
        # the URL on '/', take the last component (the file name), and drop the last
        # five characters (the '.html' extension). A URL ending in '/ABCDE12345.html',
        # for instance, yields 'ABCDE12345' as the thread id.
        self.get_title(response, item)
        self.get_source(response, item)
        self.get_url(response, item)
        self.get_news_from(response, item)
        self.get_from_url(response, item)
        self.get_text(response, item)

        return item  # remember to return the item after parsing

    def get_title(self, response, item):
        title = response.xpath("/html/head/title/text()").extract()
        if title:
            # print ('title:'+title[0][:-5].encode('utf-8'))
            item['news_title'] = title[0][:-5]

    def get_source(self, response, item):
        source = response.xpath("//div[@class='left']/text()").extract()
        if source:
            # print ('source'+source[0][:-5].encode('utf-8'))
            item['news_time'] = source[0][:-5]

    def get_news_from(self, response, item):
        news_from = response.xpath("//div[@class='left']/a/text()").extract()
        if news_from:
            # print 'from'+news_from[0].encode('utf-8')
            item['news_from'] = news_from[0]

    def get_from_url(self, response, item):
        from_url = response.xpath("//div[@class='left']/a/@href").extract()
        if from_url:
            # print ('url'+from_url[0].encode('utf-8')        )
            item['from_url'] = from_url[0]

    def get_text(self, response, item):
        news_body = response.xpath("//div[@id='endText']/p/text()").extract()
        if news_body:
            # for  entry in news_body:
            #     print (entry.encode('utf-8'))
            item['news_body'] = news_body

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            print(news_url)
        item['news_url'] = news_url


2.1.3 pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
#encoding: utf-8
import os
def ParseFilePath(url, id):
    # Users should change this output folder to fit their own machine.
    outfolder = r"e:\data\FinTech\News\Stocks\%s" % id
    # For a (hypothetical) URL like http://money.163.com/23/1125/10/ABCDE12345.html
    # the components below are year='23', monthday='1125', idx='10',
    # and page='10_ABCDE12345.html'.
    components = url.split("/")
    year = components[3]
    monthday = components[4]
    month = monthday[:2]
    day = monthday[2:]
    idx = components[5]
    page = idx + "_" + components[6]
    # folder = outfolder + r"\%s_%s_%s_" % (year, month, day)
    folder = outfolder
    if (year == '') or ('keywords' in page):
        filepath = 'xxx'  # sentinel: this URL is not a regular news article page
    else:
        filepath = folder + r"\%s_%s_%s_%s.txt" % (year, month, day, page)
    filepath = filepath.replace('?', '_')
    return (folder, filepath)

class Stock163Pipeline(object):
    def process_item(self, item, spider):
        if spider.name != "stocknews":
            return item
        if item.get("news_thread", None) is None:
            return item

        url = item['news_url']
        if 'keywords' in url:
            return item  # skip keyword/search pages
        folder, filepath = ParseFilePath(url, spider.stock_id)
        spider.counter = spider.counter + 1
        counterfilepath = folder + r"\counter.txt"
        # On a single machine there is virtually no risk of a race condition.
        if not os.path.exists(folder):
            os.makedirs(folder)
        # print(filepath, counterfilepath)
        # print(spider.stats)
        fo = open(counterfilepath, "w", encoding="UTF-8")
        fo.write(str(spider.counter))
        fo.close()

        if filepath != 'xxx':
            fo = open(filepath, 'w', encoding='utf-8')
            fo.write(str(dict(item)))
            fo.close()
        return item
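
As the header comment notes, the pipeline only runs if it is registered in the project's settings.py. A minimal sketch, assuming the project module is named stock163 as in the imports above (300 is just a conventional priority value):

# settings.py (excerpt)
ITEM_PIPELINES = {
    'stock163.pipelines.Stock163Pipeline': 300,
}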
        

2.2 Building a Scrapy Crawler That Accepts Parameters

Compared with section 2.1.2, the fragment below parameterizes the start site and the date filter; Scrapy fills these constructor arguments in from the command line (the defaults shown are placeholders):

class ExampleSpider(CrawlSpider):
    name = "stocknews"

    def __init__(self, site='money.163.com', year='2022', month='11', day='25', id='600000', *args, **kwargs):
        # allowrule = r"/%s/%s\d+/\d+/.*" % (year, month)
        allowrule = r"/%s/%s%s/\d+/.*" % (year, month, day)  # only follow news under the given date
        self.counter = 0
        self.stock_id = id
        self.start_urls = ['http://%s' % (site)]
        ExampleSpider.rules = (Rule(LinkExtractor(allow=allowrule), callback="parse_news", follow=False),)
        # recompile the rule
        super(ExampleSpider, self).__init__(*args, **kwargs)
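
Assuming the parameterized signature above, the site and date are supplied at launch time through Scrapy's -a option, for example:

scrapy crawl stocknews -a site=money.163.com -a year=2022 -a month=11 -a day=25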

2.3 Running the Scrapy Crawler

2.3.1 Running from the Command Line
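
A spider is normally launched from the project root with the scrapy crawl command; the -a option forwards keyword arguments to the spider's __init__. For the spider of section 2.1.2:

scrapy crawl stocknews -a id=600000 -a page=0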

2.3.2 Calling Scrapy from a Program

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
for site in ['money.163.com', 'tech.163.com', 'money.163.com/stock']:
    process.crawl('stocknews', site=site)  # queue one crawl per site; arguments reach __init__
process.start()  # run all queued crawls; blocks until they finish

2.4 Key Points for Running Scrapy

3 Storage and Analysis of Large-Scale Unstructured Data

4 Complete Code

from keras.applications.vgg16 import VGG16
from keras.layers import Input,Flatten,Dense,Dropout
from keras.models import Model
from keras.optimizers import SGD

from keras.datasets import mnist

import cv2
import h5py
import numpy as np

ishape = 224  # side length of the VGG16 input; MNIST images are resized to this below
model_vgg = VGG16(include_top=False, weights='imagenet', input_shape=(ishape, ishape, 3))
model = Flatten(name='flatten')(model_vgg.output)
model = Dense(4096,activation='relu',name='fc1')(model)
model = Dense(4096,activation='relu',name='fc2')(model)
model = Dropout(0.5)(model)
model = Dense(10,activation='softmax')(model)
model_vgg_mnist = Model(model_vgg.input,model,name='vgg16')

model_vgg_mnist.summary()

model_vgg = VGG16(include_top=False,weights='imagenet',input_shape=(224,224,3))
for layer in model_vgg.layers:
    layer.trainable=False
model = Flatten()(model_vgg.output)
model = Dense(4096,activation='relu',name='fc1')(model)
model = Dense(4096,activation='relu',name='fc2')(model)
model = Dropout(0.5)(model)
model = Dense(10,activation='softmax',name='prediction')(model)
model_vgg_mnist_pretrain = Model(model_vgg.input,model,name='vgg16_pretrain')

model_vgg_mnist_pretrain.summary()

sgd = SGD(lr = 0.05,decay=1e-5)
model_vgg_mnist_pretrain.compile(loss='categorical_crossentropy',optimizer=sgd,metrics=['accuracy'])

(x_train,y_train),(x_test,y_test) = mnist.load_data()
x_train = [cv2.cvtColor(cv2.resize(i,(ishape,ishape)),cv2.COLOR_GRAY2BGR) for i in x_train]
x_train = np.concatenate([arr[np.newaxis] for arr in x_train]).astype('float32')
x_test = [cv2.cvtColor(cv2.resize(i,(ishape,ishape)),cv2.COLOR_GRAY2BGR) for i in x_test]
x_test = np.concatenate([arr[np.newaxis] for arr in x_test]).astype('float32')

print(x_test.shape)
print(x_train.shape)

x_train /= 255
x_test /= 255

print(np.where(x_train[0] != 0))  # sanity check: nonzero pixel positions of the first training image

def tran_y(y):
    y_ohe = np.zeros(10)
    y_ohe[y] = 1
    return y_ohe

y_train_ohe = np.array([tran_y(y_train[i]) for i in range(len(y_train))])
y_test_ohe = np.array([tran_y(y_test[i]) for i in range(len(y_test))])

model_vgg_mnist_pretrain.fit(x_train,y_train_ohe,validation_data=(x_test,y_test_ohe),epochs=200,batch_size=128)
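
Incidentally, the hand-rolled tran_y encoder above duplicates what Keras already ships as a utility; the two list comprehensions could equally be written with keras.utils.to_categorical:

from keras.utils import to_categorical
y_train_ohe = to_categorical(y_train, 10)
y_test_ohe = to_categorical(y_test, 10)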

Original article: https://blog.csdn.net/m0_51797359/article/details/134628496
