广告流量作弊检测案例:在 IDEA 中运行代码时报错,连接不上 master 主机。
已经检查过环境配置,没有发现问题。
代码如下:
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.sql.functions._
/**
 * Exploratory analysis of ad-traffic logs, looking for click-fraud patterns.
 *
 * The job reads a CSV file with a header row, prints basic statistics (row count,
 * per-day traffic, per-column missing-value ratio), prints several distributions over
 * the `ip` and `cookie` columns, and finally writes the data set without its sparsely
 * populated columns to the Hive table `ad_traffic.AdData`.
 *
 * Usage: `Exploreore [inputCsvPath]` - when no path is given, [[DefaultInputPath]] is used.
 */
object Exploreore {
  // Set during object initialisation so that it is in place before main builds the
  // SparkSession and touches HDFS / Hive.
  System.setProperty("HADOOP_USER_NAME", "cc")

  /** Input CSV used when no path is supplied on the command line. */
  private val DefaultInputPath = "D:\\idea Class\\class 228\\src\\case_data_new.csv"

  /**
   * Entry point.
   *
   * @param args optional; `args(0)` overrides the input CSV path
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[4]")
      .appName("Explore")
      .config("spark.sql.warehouse.dir", "hdfs://master:9000/warehouse")
      // NOTE(review): the host name "master" must be resolvable from the machine that runs
      // this driver (e.g. through its hosts file) - confirm when the connection fails.
      .config("hive.metastore.uris", "thrift://master:9083")
      .enableHiveSupport()
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    // Always release the session, even when one of the actions below throws.
    try {
      val inputPath = args.headOption.getOrElse(DefaultInputPath)

      // Load the data; it is cached because many independent actions run against it.
      val rawData = spark.read.option("header", "true").csv(inputPath).cache()

      // Total record count, computed once and reused for every ratio below.
      val totalRows = rawData.count()
      println("原始数据集行数为:" + totalRows)

      // Traffic per day.
      rawData.groupBy("dt").count().selectExpr("dt", "count as dayCount").sort("dt").show()

      // Missing-value ratio of every column.
      rawData.columns.foreach(name => printMissingRate(rawData, totalRows, name))

      // Fraud pattern: a script refreshing the page.
      // Number of records per identical (ip, cookie) pair, with its share of all records.
      val cookieIpDistribution = rawData
        .groupBy("ip", "cookie")
        .count()
        .withColumn("ip_cookie_count_precent", col("count") / totalRows * 100)
        .orderBy(desc("count"))
      cookieIpDistribution.show(false)

      // Number of (ip, cookie) pairs that occur more than 100 times.
      val pairsOver100 = cookieIpDistribution.filter("count > 100").count()
      println("同ip、cookie出现超过100次以上的记录数:" + pairsOver100)

      // Fraud pattern: clearing cookies periodically, then refreshing the page.
      // For each number of distinct cookies per ip: how many ips show that number.
      val ipDistribution = rawData
        .groupBy("ip")
        .agg(countDistinct("cookie") as "ip_count")
        .groupBy("ip_count")
        .agg(
          count("ip_count") as "ip_count_count",
          count("ip_count") / totalRows * 100 as "ip_count_count_precent")
        .orderBy(desc("ip_count"))
      ipDistribution.show(false)

      // Fraud pattern: re-dialling ADSL (new ip), then refreshing the page.
      // Record count per shared first two ip segments.
      val ipTwoSegments = rawData
        .withColumn("ip_two", substring_index(col("ip"), ".", 2))
        .groupBy("ip_two")
        .agg(count("ip_two") as "ip_two_count")
        .orderBy(desc("ip_two_count"))
      ipTwoSegments.show(false)

      // Record count per shared first three ip segments.
      val ipThreeSegments = rawData
        .withColumn("ip_three", substring_index(col("ip"), ".", 3))
        .groupBy("ip_three")
        .agg(count("ip_three") as "ip_three_count")
        .orderBy(desc("ip_three_count"))
      ipThreeSegments.show(false)

      // Drop the columns with missing data and persist the rest.
      val cleaned = rawData.drop(
        "mac", "creativeid", "mobile_os", "mobile_type", "app_key_md5", "app_name_md5", "os_type")

      // saveAsTable fails when the target database is absent, so create it on demand.
      spark.sql("CREATE DATABASE IF NOT EXISTS ad_traffic")
      cleaned.write.mode(SaveMode.Overwrite).saveAsTable("ad_traffic.AdData")
    } finally {
      spark.stop()
    }
  }

  /**
   * Prints the percentage of missing values of one column.
   *
   * For every column except `creativeid`, a value is missing when it is null.
   * For `creativeid`, rows whose value equals 0 are counted as missing instead.
   *
   * @param data       data set to inspect
   * @param totalRows  number of rows in `data`, passed in to avoid recounting per column
   * @param columnName column whose missing ratio is printed
   */
  private def printMissingRate(data: DataFrame, totalRows: Long, columnName: String): Unit = {
    val missingFraction =
      if (columnName != "creativeid") {
        val presentFraction = data.select(columnName).na.drop().count().toDouble / totalRows
        1 - presentFraction
      } else {
        data.select(columnName).filter("creativeid == 0").count() / totalRows.toDouble
      }
    println(columnName + " 缺少值比率:" + missingFraction * 100 + "%")
  }
}
hive-site.xml配置
```xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://master:3306/hive?createDatabaseIfNotExist=true&amp;useSSL=false</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>Driver class name for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
<description>username to use against metastore database</description>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
<description>password to use against metastore database</description>
</property>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>hdfs://master:9000/warehouse</value>
<description>location of default database for the warehouse</description>
</property>
<property>
<name>hive.metastore.uris</name>
<value>thrift://master:9083</value>
</property>
</configuration>
```
![img](https://img-mid.csdnimg.cn/release/static/image/mid/ask/d968768ef79142a79c65e0a14de8dc4f.png "#left")
![img](https://img-mid.csdnimg.cn/release/static/image/mid/ask/5b8c505ea8f64642a663bc5dc7d081e7.png "#left")
![img](https://img-mid.csdnimg.cn/release/static/image/mid/ask/52ae4e86e25c4422976dea4640cb5ed0.png "#left")
hive metastore服务已启动
![img](https://img-mid.csdnimg.cn/release/static/image/mid/ask/db2ae98915c84075a3d17d8db33583e4.png "#left")
9083 端口处于监听状态,但是查看进程列表时却找不到对应的进程
![img](https://img-mid.csdnimg.cn/release/static/image/mid/ask/41a711dc16a8495cb5fa3164cb7d1aa8.png "#left")
亟待解决